From 7f27d55862d3177dccde35f3b5711bc79b80ac1e Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 12:19:21 +0200 Subject: [PATCH 01/21] Installation works --- conf/new/test/nemo-vfm.toml | 22 +++++++++++++++++ src/cloudai/__init__.py | 4 ++++ .../generic_slurm_container/template.py | 21 ++++++++++++++++ .../generic_slurm_container.py | 24 +++++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 conf/new/test/nemo-vfm.toml create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/template.py create mode 100644 src/cloudai/test_definitions/generic_slurm_container.py diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml new file mode 100644 index 00000000..0cafe628 --- /dev/null +++ b/conf/new/test/nemo-vfm.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm" +description = "Nemo VFM" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index fd394f24..e5d1c48a 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,7 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -98,6 +99,7 @@ SleepTestDefinition, UCCTestDefinition, ) +from .test_definitions.generic_slurm_container import SlurmContainerTestDefinition Registry().add_runner("slurm", SlurmRunner) Registry().add_runner("kubernetes", KubernetesRunner) @@ -165,6 +167,7 @@ Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition) Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition) +Registry().add_test_definition("GenericSlurmContainer", SlurmContainerTestDefinition) Registry().add_test_template("ChakraReplay", ChakraReplay) Registry().add_test_template("NcclTest", NcclTest) @@ -174,6 +177,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) +Registry().add_test_template("GenericSlurmContainer", 
GenericSlurmContainerTT) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py new file mode 100644 index 00000000..30d94048 --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cloudai import TestTemplate + + +class GenericSlurmContainerTT(TestTemplate): + pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py new file mode 100644 index 00000000..b8d86196 --- /dev/null +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -0,0 +1,24 @@ +from typing import Optional + +from cloudai import CmdArgs, Installable, TestDefinition +from cloudai.installer.installables import DockerImage + + +class SlurmContainerCmdArgs(CmdArgs): + docker_image_url: str + + +class SlurmContainerTestDefinition(TestDefinition): + cmd_args: SlurmContainerCmdArgs + + _docker_image: Optional[DockerImage] = None + + @property + def docker_image(self) -> DockerImage: + if not self._docker_image: + self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._docker_image + + @property + def installables(self) -> list[Installable]: + return [self.docker_image] From 9c4611df6e32f2ad43e39fd04018641296b9682e Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 12:25:27 +0200 Subject: [PATCH 02/21] dry-run works, but doesn't generate anything --- conf/new/test_scenario/nemo-vfm.toml | 22 +++++++++++++++++++ src/cloudai/__init__.py | 6 +++++ .../slurm_command_gen_strategy.py | 21 ++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 conf/new/test_scenario/nemo-vfm.toml create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml new file mode 100644 index 00000000..5bc15157 --- /dev/null +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm" + +[[Tests]] +id = "Tests.1" +test_name = "nemo-vfm" +num_nodes = "2" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index e5d1c48a..33de7111 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,9 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( + GenericSlurmContainerCommandGenStrategy, +) from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy @@ -150,6 +153,9 @@ Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) +Registry().add_strategy( + CommandGenStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerCommandGenStrategy +) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py new file mode 100644 index 00000000..acd7f95a --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy + + +class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): + pass From af3df566ba4a28691da340bda7f12569a036cbcc Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 14:24:39 +0200 Subject: [PATCH 03/21] Install repo and prepare for real runs --- conf/new/test/nemo-vfm.toml | 14 +++++++++++ src/cloudai/__init__.py | 4 ++-- src/cloudai/installer/slurm_installer.py | 10 ++++++++ .../slurm_command_gen_strategy.py | 20 +++++++++++++++- .../generic_slurm_container.py | 23 +++++++++++++++++-- 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml index 0cafe628..358d2e6e 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm.toml @@ -20,3 +20,17 @@ test_template_name = "GenericSlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" +# docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" + +[extra_cmd_args] +"bash" = '-c "pwd ; whoami ; torchrun --help"' +# "bash" = '-c "cd ${DIR} ; python -u nemo/collections/multimodal/vfm/train.py --yes $*"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0 " +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 33de7111..3fdee20d 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -133,7 +133,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainerTT], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -146,7 +146,7 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainerTT], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) diff --git a/src/cloudai/installer/slurm_installer.py b/src/cloudai/installer/slurm_installer.py index f7904381..638a8efc 100644 --- a/src/cloudai/installer/slurm_installer.py +++ b/src/cloudai/installer/slurm_installer.py @@ -118,6 +118,8 @@ def install_one(self, item: Installable) -> InstallStatusResult: if isinstance(item, DockerImage): res = self._install_docker_image(item) return InstallStatusResult(res.success, res.message) + elif isinstance(item, GitRepo): + return self._install_one_git_repo(item) elif isinstance(item, PythonExecutable): return self._install_python_executable(item) @@ -139,6 +141,8 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult: return InstallStatusResult(res.success, res.message) elif isinstance(item, PythonExecutable): return self._uninstall_python_executable(item) + elif isinstance(item, GitRepo): + return self._uninstall_git_repo(item) return InstallStatusResult(False, f"Unsupported item type: {type(item)}") @@ -148,6 +152,12 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult: if res.success and res.docker_image_path: item.installed_path = 
res.docker_image_path return InstallStatusResult(res.success, res.message) + elif isinstance(item, GitRepo): + repo_path = item.installed_path if item.installed_path else self.system.install_path / item.repo_name + if repo_path.exists(): + item.installed_path = repo_path + return InstallStatusResult(True) + return InstallStatusResult(False, f"Git repository {item.git_url} not cloned") elif isinstance(item, PythonExecutable): return self._is_python_executable_installed(item) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index acd7f95a..332f5606 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -14,8 +14,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any, cast + +from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy +from cloudai.test_definitions.generic_slurm_container import SlurmContainerTestDefinition class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): - pass + def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: + tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) + slurm_args["image_path"] = tdef.docker_image.installed_path + # slurm_args["container_mounts"] = "" # TBD + cmd = super().generate_srun_prefix(slurm_args, tr) + + # cmd = ["pip", "install", "-e", ".", "\n"] + cmd + return cmd + + def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: + srun_command_parts: list[str] = [] + if tr.test.extra_cmd_args: + srun_command_parts.append(tr.test.extra_cmd_args) + + return srun_command_parts diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index b8d86196..002a7d00 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -1,17 +1,20 @@ from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition -from cloudai.installer.installables import DockerImage +from cloudai.installer.installables import DockerImage, GitRepo class SlurmContainerCmdArgs(CmdArgs): docker_image_url: str + repository_url: str + repository_commit_hash: str class SlurmContainerTestDefinition(TestDefinition): cmd_args: SlurmContainerCmdArgs _docker_image: Optional[DockerImage] = None + _git_repo: Optional[GitRepo] = None @property def docker_image(self) -> DockerImage: @@ -19,6 +22,22 @@ def docker_image(self) -> DockerImage: self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) return self._docker_image + @property + def git_repo(self) -> GitRepo: + if not self._git_repo: + self._python_executable = GitRepo( + git_url=self.cmd_args.repository_url, commit_hash=self.cmd_args.repository_commit_hash + ) + + return self._python_executable + @property def installables(self) -> list[Installable]: - return [self.docker_image] + return [self.docker_image, self.git_repo] + + @property + def extra_args_str(self) -> str: + parts = [] + for k, v in self.extra_cmd_args.items(): + parts.append(f"{k} {v}" if v else k) + return " ".join(parts) From cef9c159333da1bae94c46d0c34cc74f1f95ddff Mon 
Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 6 Nov 2024 15:28:48 +0200 Subject: [PATCH 04/21] Finalize single node run test --- conf/new/test/nemo-vfm.toml | 12 +++++------- conf/new/test_scenario/nemo-vfm.toml | 4 ++-- src/cloudai/installer/slurm_installer.py | 2 +- src/cloudai/runner/slurm/slurm_runner.py | 1 + .../slurm_command_gen_strategy.py | 5 ++--- .../test_definitions/generic_slurm_container.py | 4 ++-- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm.toml index 358d2e6e..3d82f495 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm.toml @@ -14,23 +14,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nemo-vfm" -description = "Nemo VFM" +name = "nemo-vfm-single" +description = "Nemo VFM for single node" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" -# docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" [extra_cmd_args] -"bash" = '-c "pwd ; whoami ; torchrun --help"' -# "bash" = '-c "cd ${DIR} ; python -u nemo/collections/multimodal/vfm/train.py --yes $*"' +"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . ; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1"' [extra_env_vars] "WANDB_PROJECT" = "vfm" "WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0 " +"NVTE_FUSED_ATTN" = "0" "CUDA_DEVICE_MAX_CONNECTIONS" = "1" "PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 5bc15157..8daea380 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -18,5 +18,5 @@ name = "nemo-vfm" [[Tests]] id = "Tests.1" -test_name = "nemo-vfm" -num_nodes = "2" +test_name = "nemo-vfm-single" +num_nodes = 1 diff --git a/src/cloudai/installer/slurm_installer.py b/src/cloudai/installer/slurm_installer.py index 638a8efc..8d542f28 100644 --- a/src/cloudai/installer/slurm_installer.py +++ b/src/cloudai/installer/slurm_installer.py @@ -153,7 +153,7 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult: item.installed_path = res.docker_image_path return InstallStatusResult(res.success, res.message) elif isinstance(item, GitRepo): - repo_path = item.installed_path if item.installed_path else self.system.install_path / item.repo_name + repo_path = self.system.install_path / item.repo_name if repo_path.exists(): item.installed_path = repo_path return InstallStatusResult(True) diff --git a/src/cloudai/runner/slurm/slurm_runner.py b/src/cloudai/runner/slurm/slurm_runner.py index 3023e9d0..3726e252 100644 --- a/src/cloudai/runner/slurm/slurm_runner.py +++ b/src/cloudai/runner/slurm/slurm_runner.py @@ -68,4 +68,5 @@ def _submit_test(self, tr: TestRun) -> SlurmJob: stderr=stderr, message="Failed to retrieve job ID from command output.", ) + 
logging.info(f"Submitted slurm job: {job_id}") return SlurmJob(self.mode, self.system, tr, job_id) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 332f5606..4e06c8a8 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -25,10 +25,9 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - # slurm_args["container_mounts"] = "" # TBD - cmd = super().generate_srun_prefix(slurm_args, tr) + slurm_args["container_mounts"] = f"{tdef.git_repo.installed_path.absolute()}:/work" - # cmd = ["pip", "install", "-e", ".", "\n"] + cmd + cmd = super().generate_srun_prefix(slurm_args, tr) return cmd def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index 002a7d00..b7edaea2 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -25,11 +25,11 @@ def docker_image(self) -> DockerImage: @property def git_repo(self) -> GitRepo: if not self._git_repo: - self._python_executable = GitRepo( + self._git_repo = GitRepo( git_url=self.cmd_args.repository_url, commit_hash=self.cmd_args.repository_commit_hash ) - return self._python_executable + return self._git_repo @property def installables(self) -> list[Installable]: From 465a08acf37d455ae72aebd4b770774b9fa8101d Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Thu, 7 Nov 2024 10:49:08 +0200 Subject: [PATCH 05/21] Use abs path --- conf/new/test/{nemo-vfm.toml => nemo-vfm-single.toml} | 2 +- conf/new/test_scenario/nemo-vfm.toml | 5 +++++ .../generic_slurm_container/slurm_command_gen_strategy.py | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) rename conf/new/test/{nemo-vfm.toml => nemo-vfm-single.toml} (95%) diff --git a/conf/new/test/nemo-vfm.toml b/conf/new/test/nemo-vfm-single.toml similarity index 95% rename from conf/new/test/nemo-vfm.toml rename to conf/new/test/nemo-vfm-single.toml index 3d82f495..a9f36806 100644 --- a/conf/new/test/nemo-vfm.toml +++ b/conf/new/test/nemo-vfm-single.toml @@ -24,7 +24,7 @@ repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . ; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1"' +"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . 
; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1 trainer.max_steps=500"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 8daea380..ecab0e53 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -20,3 +20,8 @@ name = "nemo-vfm" id = "Tests.1" test_name = "nemo-vfm-single" num_nodes = 1 + +[[Tests]] +id = "Tests.2" +test_name = "nemo-vfm-single" +num_nodes = 2 diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 4e06c8a8..470f5ab4 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import Any, cast from cloudai import TestRun @@ -25,7 +26,8 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - slurm_args["container_mounts"] = f"{tdef.git_repo.installed_path.absolute()}:/work" + repo_path = tdef.git_repo.installed_path or Path.cwd() + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work" cmd = super().generate_srun_prefix(slurm_args, tr) return cmd From e73eb98ee96bb310c058f5a8405a05ea18e4001e Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 7 Nov 2024 06:11:37 -0800 Subject: [PATCH 06/21] Update for nemo-vfm training --- conf/new/test/nemo-vfm-single.toml | 8 +++++--- conf/new/test_scenario/nemo-vfm.toml | 10 ++++++---- .../slurm_command_gen_strategy.py | 5 +++-- .../test_definitions/generic_slurm_container.py | 14 +++++++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/conf/new/test/nemo-vfm-single.toml b/conf/new/test/nemo-vfm-single.toml index a9f36806..ba10a5e4 100644 --- a/conf/new/test/nemo-vfm-single.toml +++ b/conf/new/test/nemo-vfm-single.toml @@ -19,12 +19,14 @@ description = "Nemo VFM for single node" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "f7c546022acca7cf818ec88398f408f53b012586" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "7e9490ad83439a2db96a4af557aed32a9ce72ef7" # main branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls -l ; pip install -e . 
; torchrun --nproc-per-node=8 nemo/collections/diffusion/train.py --yes --factory train_mock trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.context_parallel_size=1 trainer.strategy.sequence_parallel=False model.config.num_layers=1 data.global_batch_size=8 model.config.hidden_size=1024 data.seq_length=1024 data.task_encoder.seq_length=1024 model.config.num_layers=1 trainer.max_steps=500"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; python -u nemo/collections/multimodal/vfm/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index ecab0e53..1ba879d5 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -20,8 +20,10 @@ name = "nemo-vfm" id = "Tests.1" test_name = "nemo-vfm-single" num_nodes = 1 +time_limit = "01:00:00" -[[Tests]] -id = "Tests.2" -test_name = "nemo-vfm-single" -num_nodes = 2 +# [[Tests]] +# id = "Tests.2" +# test_name = "nemo-vfm-single" +# num_nodes = 2 +# time_limit = "01:00:00" diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 470f5ab4..304e14ee 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -27,10 +27,11 @@ def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[ tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work" + mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,/lustre:/lustre/,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().generate_srun_prefix(slurm_args, tr) - return cmd + return cmd + ["--no-container-mount-home"] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index b7edaea2..5ec099e8 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -8,6 +8,8 @@ class SlurmContainerCmdArgs(CmdArgs): docker_image_url: str repository_url: str repository_commit_hash: str + mcore_vfm_repo: str + mcore_vfm_commit_hash: str class SlurmContainerTestDefinition(TestDefinition): @@ -15,6 +17,7 @@ class SlurmContainerTestDefinition(TestDefinition): _docker_image: Optional[DockerImage] = None _git_repo: Optional[GitRepo] = None + _mcore_git_repo: Optional[GitRepo] = None @property def docker_image(self) -> DockerImage: @@ -31,9 +34,18 @@ def git_repo(self) -> GitRepo: return self._git_repo + @property + def 
mcore_vfm_git_repo(self) -> GitRepo: + if not self._mcore_git_repo: + self._mcore_git_repo = GitRepo( + git_url=self.cmd_args.mcore_vfm_repo, commit_hash=self.cmd_args.mcore_vfm_commit_hash + ) + + return self._mcore_git_repo + @property def installables(self) -> list[Installable]: - return [self.docker_image, self.git_repo] + return [self.docker_image, self.git_repo, self.mcore_vfm_git_repo] @property def extra_args_str(self) -> str: From 7b661e66262da772a630c97efdd1eff4b00bc241 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Fri, 8 Nov 2024 03:29:33 -0800 Subject: [PATCH 07/21] Update configs --- ...-vfm-single.toml => nemo-vfm-ditllama28b_8k.toml} | 10 +++++----- conf/new/test_scenario/nemo-vfm.toml | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) rename conf/new/test/{nemo-vfm-single.toml => nemo-vfm-ditllama28b_8k.toml} (63%) diff --git a/conf/new/test/nemo-vfm-single.toml b/conf/new/test/nemo-vfm-ditllama28b_8k.toml similarity index 63% rename from conf/new/test/nemo-vfm-single.toml rename to conf/new/test/nemo-vfm-ditllama28b_8k.toml index ba10a5e4..a35ce35c 100644 --- a/conf/new/test/nemo-vfm-single.toml +++ b/conf/new/test/nemo-vfm-ditllama28b_8k.toml @@ -14,19 +14,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -name = "nemo-vfm-single" -description = "Nemo VFM for single node" +name = "nemo-vfm-ditllama28b_8k" +description = "Nemo VFM factory=mock_ditllama28b_8k" test_template_name = "GenericSlurmContainer" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.07.training.2" +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "7e9490ad83439a2db96a4af557aed32a9ce72ef7" # main branch +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; python -u nemo/collections/multimodal/vfm/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index 1ba879d5..f290fe1a 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -18,12 +18,12 @@ name = "nemo-vfm" [[Tests]] id = "Tests.1" -test_name = "nemo-vfm-single" +test_name = "nemo-vfm-ditllama28b_8k" 
num_nodes = 1 time_limit = "01:00:00" -# [[Tests]] -# id = "Tests.2" -# test_name = "nemo-vfm-single" -# num_nodes = 2 -# time_limit = "01:00:00" +[[Tests]] +id = "Tests.2" +test_name = "nemo-vfm-ditllama28b_8k" +num_nodes = 8 +time_limit = "01:00:00" From 8d630efd6d9754809d6fd58951f8d79cd9ef4553 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Fri, 8 Nov 2024 04:18:17 -0800 Subject: [PATCH 08/21] Add more configs --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 36 +++++++++++++++++++ conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 36 +++++++++++++++++++ .../test/nemo-vfm-mock_ditllama28b_65k.toml | 36 +++++++++++++++++++ conf/new/test_scenario/nemo-vfm.toml | 22 +++++++++--- 4 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 conf/new/test/nemo-vfm-mock_dit7b_65k.toml create mode 100644 conf/new/test/nemo-vfm-mock_dit7b_8k.toml create mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml new file mode 100644 index 00000000..bc1bd453 --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm-mock_dit7b_65k" +description = "Nemo VFM factory=mock_mock_dit7b_65k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml new file mode 100644 index 00000000..208cb60a --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo-vfm-mock_dit7b_8k" +description = "Nemo VFM factory=mock_mock_dit7b_8k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml new file mode 100644 index 00000000..a27e18e3 --- /dev/null +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nemo-vfm-mock_ditllama28b_65k" +description = "Nemo VFM factory=mock_mock_ditllama28b_65k" +test_template_name = "GenericSlurmContainer" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" +repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" +repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch +mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" +mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch + +[extra_cmd_args] +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' + +[extra_env_vars] +"WANDB_PROJECT" = "vfm" +"WANDB_RESUME" = "allow" +"NVTE_FUSED_ATTN" = "0" +"CUDA_DEVICE_MAX_CONNECTIONS" = "1" +"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml index f290fe1a..1a695dbb 100644 --- a/conf/new/test_scenario/nemo-vfm.toml +++ b/conf/new/test_scenario/nemo-vfm.toml @@ -17,13 +17,25 @@ name = "nemo-vfm" [[Tests]] -id = "Tests.1" -test_name = "nemo-vfm-ditllama28b_8k" -num_nodes = 1 +id = "Tests.mock.dit7b_8k" +test_name = "nemo-vfm-mock_dit7b_8k" +num_nodes = 8 +time_limit = "01:00:00" + +[[Tests]] +id = "Tests.mock.dit7b_65k" +test_name = "nemo-vfm-mock_dit7b_65k" +num_nodes = 8 +time_limit = "01:00:00" + +[[Tests]] +id = "Tests.mock.ditllama28b_8k" +test_name = "nemo-vfm-mock_ditllama28b_8k" +num_nodes = 8 time_limit = "01:00:00" [[Tests]] -id = "Tests.2" -test_name = "nemo-vfm-ditllama28b_8k" +id = "Tests.mock.ditllama28b_65k" +test_name = "nemo-vfm-mock_ditllama28b_65k" num_nodes = 8 time_limit = "01:00:00" From 2398218ffa7648c666318fe1d1024f9d811b7f59 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Fri, 8 Nov 2024 15:24:55 +0200 Subject: [PATCH 09/21] Fixes and tuning --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 4 ++-- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 4 ++-- conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml | 4 ++-- ...-ditllama28b_8k.toml => nemo-vfm-mock_ditllama28b_8k.toml} | 2 +- .../generic_slurm_container/slurm_command_gen_strategy.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) rename conf/new/test/{nemo-vfm-ditllama28b_8k.toml => nemo-vfm-mock_ditllama28b_8k.toml} (98%) diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml index bc1bd453..deb12387 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_dit7b_65k" -description = "Nemo VFM factory=mock_mock_dit7b_65k" +description = "Nemo VFM factory=mock_dit7b_65k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml index 208cb60a..02f45c13 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_dit7b_8k" -description = "Nemo VFM factory=mock_mock_dit7b_8k" +description = "Nemo VFM factory=mock_dit7b_8k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml index a27e18e3..fec9c947 100644 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -15,7 +15,7 @@ # limitations under the License. 
name = "nemo-vfm-mock_ditllama28b_65k" -description = "Nemo VFM factory=mock_mock_ditllama28b_65k" +description = "Nemo VFM factory=mock_ditllama28b_65k" test_template_name = "GenericSlurmContainer" [cmd_args] @@ -26,7 +26,7 @@ mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch [extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' +"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' [extra_env_vars] "WANDB_PROJECT" = "vfm" diff --git a/conf/new/test/nemo-vfm-ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml similarity index 98% rename from conf/new/test/nemo-vfm-ditllama28b_8k.toml rename to conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml index a35ce35c..9ff187f2 100644 --- a/conf/new/test/nemo-vfm-ditllama28b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-name = "nemo-vfm-ditllama28b_8k" +name = "nemo-vfm-mock_ditllama28b_8k" description = "Nemo VFM factory=mock_ditllama28b_8k" test_template_name = "GenericSlurmContainer" diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 304e14ee..895924a2 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -28,7 +28,7 @@ def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[ slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,/lustre:/lustre/,{mcore_vfm_path.absolute()}:/opt/megatron-lm" + slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().generate_srun_prefix(slurm_args, tr) return cmd + ["--no-container-mount-home"] From 4f03e0251f2cf55ba12181767b40b1b056bcd059 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:02:05 +0200 Subject: [PATCH 10/21] Add csv report generation for GenericSlurmContainer --- src/cloudai/__init__.py | 7 ++ .../report_generator/report_generator.py | 14 ++-- .../report_generation_strategy.py | 64 +++++++++++++++++++ 3 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index 3fdee20d..eb22d844 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,6 +57,9 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy +from .schema.test_template.generic_slurm_container.report_generation_strategy import ( + GenericSlurmContainerReportGenerationStrategy, +) from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( GenericSlurmContainerCommandGenStrategy, ) @@ -126,7 +129,11 @@ Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy) +Registry().add_strategy( + ReportGenerationStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerReportGenerationStrategy +) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) + Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy) diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index 3c8a7e2a..d2b149d2 100644 --- 
a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -70,7 +70,13 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None: tr (TestRun): The test run object. """ for subdir in directory_path.iterdir(): - if subdir.is_dir() and tr.test.test_template.can_handle_directory(subdir): - tr.test.test_template.generate_report(tr.test.name, subdir, tr.sol) - else: - logging.warning(f"Skipping directory '{subdir}' for test '{tr.test.name}'") + if not subdir.is_dir(): + logging.debug(f"Skipping file '{subdir}', not a directory.") + continue + if not tr.test.test_template.can_handle_directory(subdir): + logging.warning( + f"Skipping '{subdir}', can't handle with strategy={tr.test.test_template.report_generation_strategy}." + ) + continue + + tr.test.test_template.generate_report(tr.test.name, subdir, tr.sol) diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py new file mode 100644 index 00000000..9151e3e7 --- /dev/null +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import re +from pathlib import Path +from typing import Optional + +from cloudai import ReportGenerationStrategy + + +class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): + def can_handle_directory(self, directory_path: Path) -> bool: + stdout_path = directory_path / "stdout.txt" + if stdout_path.exists(): + with stdout_path.open("r") as file: + if re.search( + r"Training epoch \d+, iteration \d+/\d+ | lr: [\d.]+ | global_batch_size: \d+ | global_step: \d+ | " + r"reduced_train_loss: [\d.]+ | train_step_timing in s: [\d.]+", + file.read(), + ): + return True + return False + + def generate_report(self, test_name: str, directory_path: Path, sol: Optional[float] = None) -> None: + stdout_path = directory_path / "stdout.txt" + if not stdout_path.is_file(): + return + + # Training epoch 0, iteration 1/9 | lr: 0.0001 | global_batch_size: 256 | global_step: 1 | reduced_train_loss: 3.616 | train_step_timing in s: 112.7 | consumed_samples: 512 + # parse data from stdout.txt and save into csv file + with stdout_path.open("r") as file: + lines = file.readlines() + with open(directory_path / "report.csv", "w") as csv_file: + csv_file.write( + "epoch,iteration,lr,global_batch_size,global_step,reduced_train_loss,train_step_timing,consumed_samples\n" + ) + for line in lines: + pattern = ( + r"Training epoch (\d+), iteration (\d+)/\d+ \| lr: ([\d.]+) \| global_batch_size: (\d+) \| " + r"global_step: (\d+) \| reduced_train_loss: ([\d.]+) \| train_step_timing in s: ([\d.]+)" + ) + if " | consumed_samples:" in line: + pattern = ( + r"Training epoch (\d+), iteration (\d+)/\d+ \| lr: ([\d.]+) \| global_batch_size: (\d+) \| " + r"global_step: (\d+) \| reduced_train_loss: ([\d.]+) \| train_step_timing in s: ([\d.]+) " + r"\| consumed_samples: (\d+)" + ) + + match = re.match(pattern, line) + if match: + csv_file.write(",".join(match.groups()) + "\n") From 4a53e9c0f26db46f4cb2731216417879cee1a02a Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:29:31 +0200 Subject: [PATCH 11/21] Make linters happy and tests pass --- conf/common/system/example_slurm_cluster.toml | 2 +- src/cloudai/__init__.py | 15 ++++++++------ .../report_generator/report_generator.py | 3 ++- .../report_generation_strategy.py | 4 ++-- .../slurm_command_gen_strategy.py | 2 ++ .../generic_slurm_container/template.py | 4 +++- .../generic_slurm_container.py | 20 +++++++++++++++++++ tests/test_init.py | 8 ++++++-- 8 files changed, 45 insertions(+), 13 deletions(-) diff --git a/conf/common/system/example_slurm_cluster.toml b/conf/common/system/example_slurm_cluster.toml index ddf2f210..ff795171 100644 --- a/conf/common/system/example_slurm_cluster.toml +++ b/conf/common/system/example_slurm_cluster.toml @@ -17,7 +17,7 @@ name = "example-cluster" scheduler = "slurm" -install_path = "./install" +install_path = "./install_dir" output_path = "./results" default_partition = "partition_1" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index eb22d844..eeaa022b 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -63,7 +63,7 @@ from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( GenericSlurmContainerCommandGenStrategy, ) -from .schema.test_template.generic_slurm_container.template import GenericSlurmContainerTT +from .schema.test_template.generic_slurm_container.template import GenericSlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from 
.schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -130,7 +130,10 @@ Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy) Registry().add_strategy( - ReportGenerationStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerReportGenerationStrategy + ReportGenerationStrategy, + [SlurmSystem], + [GenericSlurmContainer], + GenericSlurmContainerReportGenerationStrategy, ) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) @@ -140,7 +143,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainerTT], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainer], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -153,7 +156,7 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainerTT], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainer], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) @@ -161,7 +164,7 @@ Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) Registry().add_strategy( - CommandGenStrategy, [SlurmSystem], [GenericSlurmContainerTT], GenericSlurmContainerCommandGenStrategy + CommandGenStrategy, [SlurmSystem], [GenericSlurmContainer], GenericSlurmContainerCommandGenStrategy ) Registry().add_installer("slurm", SlurmInstaller) @@ -190,7 +193,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainerTT) +Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/report_generator/report_generator.py b/src/cloudai/report_generator/report_generator.py index d2b149d2..9d7ef563 100644 --- a/src/cloudai/report_generator/report_generator.py +++ b/src/cloudai/report_generator/report_generator.py @@ -75,7 +75,8 @@ def _generate_test_report(self, directory_path: Path, tr: TestRun) -> None: continue if not tr.test.test_template.can_handle_directory(subdir): logging.warning( - f"Skipping '{subdir}', can't handle with strategy={tr.test.test_template.report_generation_strategy}." + f"Skipping '{subdir}', can't handle with " + f"strategy={tr.test.test_template.report_generation_strategy}."
) continue diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py index 9151e3e7..82cc00a9 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -22,6 +22,8 @@ class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): + """Report generation strategy for a generic Slurm container test.""" + def can_handle_directory(self, directory_path: Path) -> bool: stdout_path = directory_path / "stdout.txt" if stdout_path.exists(): @@ -39,8 +41,6 @@ def generate_report(self, test_name: str, directory_path: Path, sol: Optional[fl if not stdout_path.is_file(): return - # Training epoch 0, iteration 1/9 | lr: 0.0001 | global_batch_size: 256 | global_step: 1 | reduced_train_loss: 3.616 | train_step_timing in s: 112.7 | consumed_samples: 512 - # parse data from stdout.txt and save into csv file with stdout_path.open("r") as file: lines = file.readlines() with open(directory_path / "report.csv", "w") as csv_file: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 895924a2..76dab8ca 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -23,6 +23,8 @@ class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): + """Command generation strategy for generic Slurm container tests.""" + def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py index 30d94048..f8b8cb30 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/template.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -17,5 +17,7 @@ from cloudai import TestTemplate -class GenericSlurmContainerTT(TestTemplate): +class GenericSlurmContainer(TestTemplate): + """Generic Slurm container test template.""" + pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/generic_slurm_container.py index 5ec099e8..f4ca6a2b 100644 --- a/src/cloudai/test_definitions/generic_slurm_container.py +++ b/src/cloudai/test_definitions/generic_slurm_container.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition @@ -5,6 +21,8 @@ class SlurmContainerCmdArgs(CmdArgs): + """Command line arguments for a generic Slurm container test.""" + docker_image_url: str repository_url: str repository_commit_hash: str @@ -13,6 +31,8 @@ class SlurmContainerCmdArgs(CmdArgs): class SlurmContainerTestDefinition(TestDefinition): + """Test definition for a generic Slurm container test.""" + cmd_args: SlurmContainerCmdArgs _docker_image: Optional[DockerImage] = None diff --git a/tests/test_init.py b/tests/test_init.py index 410e154b..e6f8e2bb 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,11 +18,13 @@ from cloudai import ( CommandGenStrategy, + GenericSlurmContainer, GradingStrategy, JobIdRetrievalStrategy, JsonGenStrategy, Registry, ReportGenerationStrategy, + SlurmContainerTestDefinition, ) from cloudai.installer.slurm_installer import SlurmInstaller from cloudai.installer.standalone_installer import StandaloneInstaller @@ -127,12 +129,13 @@ def test_strategies(key: tuple, value: type): def test_test_templates(): test_templates = Registry().test_templates_map - assert len(test_templates) == 8 + assert len(test_templates) == 9 assert test_templates["ChakraReplay"] == ChakraReplay assert test_templates["NcclTest"] == NcclTest assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert test_templates["UCCTest"] == UCCTest + assert test_templates["GenericSlurmContainer"] == GenericSlurmContainer def test_installers(): @@ -144,12 +147,13 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 8 + assert len(test_defs) == 9 assert test_defs["UCCTest"] == UCCTestDefinition assert test_defs["NcclTest"] == NCCLTestDefinition assert test_defs["ChakraReplay"] == ChakraReplayTestDefinition assert test_defs["Sleep"] == SleepTestDefinition assert test_defs["NeMoLauncher"] == NeMoLauncherTestDefinition + assert test_defs["GenericSlurmContainer"] == SlurmContainerTestDefinition def test_definitions_matches_templates(): From 0379c5a58191436d8add9ebaeb449427b1997b80 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:34:32 +0200 Subject: [PATCH 12/21] Rely on pyproject for running dev tools --- .github/workflows/ci.yml | 6 +++--- pyproject.toml | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 207d7be5..1077797a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,13 +23,13 @@ jobs: run: pip install -r requirements-dev.txt - name: Run ruff linter - run: ruff check . + run: ruff check - name: Run ruff formatter - run: ruff format --check --diff . + run: ruff format --check --diff - name: Run pyright - run: pyright . 
+ run: pyright - name: Run vulture check run: vulture src/ tests/ diff --git a/pyproject.toml b/pyproject.toml index a6442964..4ef950cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,3 +100,6 @@ min_confidence = 100 [tool.coverage.report] exclude_also = ["@abstractmethod"] + +[tool.pyright] +include = ["src", "tests"] From 4d1fbba4c8ae6b9b41cdf932d7db2bd8ac19df83 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:08:08 +0100 Subject: [PATCH 13/21] Simplify naming --- src/cloudai/__init__.py | 22 +++++++++---------- .../report_generation_strategy.py | 2 +- .../slurm_command_gen_strategy.py | 4 ++-- .../generic_slurm_container/template.py | 2 +- ..._slurm_container.py => slurm_container.py} | 0 tests/test_init.py | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) rename src/cloudai/test_definitions/{generic_slurm_container.py => slurm_container.py} (100%) diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index eeaa022b..fe695281 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -58,12 +58,12 @@ from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy from .schema.test_template.generic_slurm_container.report_generation_strategy import ( - GenericSlurmContainerReportGenerationStrategy, + SlurmContainerReportGenerationStrategy, ) from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( - GenericSlurmContainerCommandGenStrategy, + SlurmContainerCommandGenStrategy, ) -from .schema.test_template.generic_slurm_container.template import GenericSlurmContainer +from .schema.test_template.generic_slurm_container.template import SlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -105,7 +105,7 @@ SleepTestDefinition, UCCTestDefinition, ) -from .test_definitions.generic_slurm_container import SlurmContainerTestDefinition +from .test_definitions.slurm_container import SlurmContainerTestDefinition Registry().add_runner("slurm", SlurmRunner) Registry().add_runner("kubernetes", KubernetesRunner) @@ -132,8 +132,8 @@ Registry().add_strategy( ReportGenerationStrategy, [SlurmSystem], - [GenericSlurmContainer], - GenericSlurmContainerReportGenerationStrategy, + [SlurmContainer], + SlurmContainerReportGenerationStrategy, ) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) @@ -143,7 +143,7 @@ Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, GenericSlurmContainer], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, SlurmContainer], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -156,16 +156,14 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep, GenericSlurmContainer], + [ChakraReplay, UCCTest, NeMoLauncher, Sleep, SlurmContainer], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) 
Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayReportGenerationStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [ChakraReplay], ChakraReplayGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [ChakraReplay], ChakraReplaySlurmCommandGenStrategy) -Registry().add_strategy( - CommandGenStrategy, [SlurmSystem], [GenericSlurmContainer], GenericSlurmContainerCommandGenStrategy -) +Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [SlurmContainer], SlurmContainerCommandGenStrategy) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) @@ -193,7 +191,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", GenericSlurmContainer) +Registry().add_test_template("GenericSlurmContainer", SlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py index 82cc00a9..c7c4554d 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py @@ -21,7 +21,7 @@ from cloudai import ReportGenerationStrategy -class GenericSlurmContainerReportGenerationStrategy(ReportGenerationStrategy): +class SlurmContainerReportGenerationStrategy(ReportGenerationStrategy): """Report generation strategy for a generic Slurm container test.""" def can_handle_directory(self, directory_path: Path) -> bool: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py index 76dab8ca..dd43aa18 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py @@ -19,10 +19,10 @@ from cloudai import TestRun from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy -from cloudai.test_definitions.generic_slurm_container import SlurmContainerTestDefinition +from cloudai.test_definitions.slurm_container import SlurmContainerTestDefinition -class GenericSlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): +class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for generic Slurm container tests.""" def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/generic_slurm_container/template.py index f8b8cb30..9e49eb35 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/template.py +++ b/src/cloudai/schema/test_template/generic_slurm_container/template.py @@ -17,7 +17,7 @@ from cloudai import TestTemplate -class GenericSlurmContainer(TestTemplate): +class SlurmContainer(TestTemplate): """Generic Slurm container test template.""" pass diff --git a/src/cloudai/test_definitions/generic_slurm_container.py b/src/cloudai/test_definitions/slurm_container.py similarity index 100% rename from src/cloudai/test_definitions/generic_slurm_container.py rename to 
src/cloudai/test_definitions/slurm_container.py diff --git a/tests/test_init.py b/tests/test_init.py index e6f8e2bb..5ca6c5f2 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,12 +18,12 @@ from cloudai import ( CommandGenStrategy, - GenericSlurmContainer, GradingStrategy, JobIdRetrievalStrategy, JsonGenStrategy, Registry, ReportGenerationStrategy, + SlurmContainer, SlurmContainerTestDefinition, ) from cloudai.installer.slurm_installer import SlurmInstaller @@ -135,7 +135,7 @@ def test_test_templates(): assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert test_templates["UCCTest"] == UCCTest - assert test_templates["GenericSlurmContainer"] == GenericSlurmContainer + assert test_templates["GenericSlurmContainer"] == SlurmContainer def test_installers(): From 30df9c439adcf9c1f2e5fd8e4f918a61e9f34b56 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:19:08 +0100 Subject: [PATCH 14/21] Simplify naming and after-merge updates --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 2 +- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 2 +- .../test/nemo-vfm-mock_ditllama28b_65k.toml | 2 +- .../new/test/nemo-vfm-mock_ditllama28b_8k.toml | 2 +- src/cloudai/__init__.py | 18 +++++++++--------- .../report_generation_strategy.py | 0 .../slurm_command_gen_strategy.py | 6 +++--- .../template.py | 0 .../strategy/slurm_command_gen_strategy.py | 4 ++-- tests/test_init.py | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/report_generation_strategy.py (100%) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/slurm_command_gen_strategy.py (90%) rename src/cloudai/schema/test_template/{generic_slurm_container => slurm_container}/template.py (100%) diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml index deb12387..e8fe485f 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_dit7b_65k" description = "Nemo VFM factory=mock_dit7b_65k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml index 02f45c13..1d7bcd5b 100644 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_dit7b_8k" description = "Nemo VFM factory=mock_dit7b_8k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml index fec9c947..6f78c14b 100644 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_ditllama28b_65k" description = "Nemo VFM factory=mock_ditllama28b_65k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml index 9ff187f2..2083ad69 100644 --- 
a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml +++ b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml @@ -16,7 +16,7 @@ name = "nemo-vfm-mock_ditllama28b_8k" description = "Nemo VFM factory=mock_ditllama28b_8k" -test_template_name = "GenericSlurmContainer" +test_template_name = "SlurmContainer" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index fe695281..fc4dc7a0 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -57,13 +57,6 @@ from .schema.test_template.common.default_job_status_retrieval_strategy import DefaultJobStatusRetrievalStrategy from .schema.test_template.common.slurm_job_id_retrieval_strategy import SlurmJobIdRetrievalStrategy from .schema.test_template.common.standalone_job_id_retrieval_strategy import StandaloneJobIdRetrievalStrategy -from .schema.test_template.generic_slurm_container.report_generation_strategy import ( - SlurmContainerReportGenerationStrategy, -) -from .schema.test_template.generic_slurm_container.slurm_command_gen_strategy import ( - SlurmContainerCommandGenStrategy, -) -from .schema.test_template.generic_slurm_container.template import SlurmContainer from .schema.test_template.jax_toolbox.grading_strategy import JaxToolboxGradingStrategy from .schema.test_template.jax_toolbox.job_status_retrieval_strategy import JaxToolboxJobStatusRetrievalStrategy from .schema.test_template.jax_toolbox.report_generation_strategy import JaxToolboxReportGenerationStrategy @@ -88,6 +81,13 @@ from .schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from .schema.test_template.sleep.standalone_command_gen_strategy import SleepStandaloneCommandGenStrategy from .schema.test_template.sleep.template import Sleep +from .schema.test_template.slurm_container.report_generation_strategy import ( + SlurmContainerReportGenerationStrategy, +) +from .schema.test_template.slurm_container.slurm_command_gen_strategy import ( + SlurmContainerCommandGenStrategy, +) +from .schema.test_template.slurm_container.template import SlurmContainer from .schema.test_template.ucc_test.grading_strategy import UCCTestGradingStrategy from .schema.test_template.ucc_test.report_generation_strategy import UCCTestReportGenerationStrategy from .schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy @@ -181,7 +181,7 @@ Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition) Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition) -Registry().add_test_definition("GenericSlurmContainer", SlurmContainerTestDefinition) +Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition) Registry().add_test_template("ChakraReplay", ChakraReplay) Registry().add_test_template("NcclTest", NcclTest) @@ -191,7 +191,7 @@ Registry().add_test_template("JaxToolboxGPT", JaxToolbox) Registry().add_test_template("JaxToolboxGrok", JaxToolbox) Registry().add_test_template("JaxToolboxNemotron", JaxToolbox) -Registry().add_test_template("GenericSlurmContainer", SlurmContainer) +Registry().add_test_template("SlurmContainer", SlurmContainer) __all__ = [ "BaseInstaller", diff --git a/src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py b/src/cloudai/schema/test_template/slurm_container/report_generation_strategy.py similarity index 100% rename from 
src/cloudai/schema/test_template/generic_slurm_container/report_generation_strategy.py rename to src/cloudai/schema/test_template/slurm_container/report_generation_strategy.py diff --git a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py similarity index 90% rename from src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py rename to src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index dd43aa18..162febdf 100644 --- a/src/cloudai/schema/test_template/generic_slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -25,15 +25,15 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for generic Slurm container tests.""" - def generate_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: + def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path repo_path = tdef.git_repo.installed_path or Path.cwd() mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" - cmd = super().generate_srun_prefix(slurm_args, tr) - return cmd + ["--no-container-mount-home"] + cmd = super().gen_srun_prefix(slurm_args, tr) + return cmd + ["--no-container-mount-home "] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/src/cloudai/schema/test_template/generic_slurm_container/template.py b/src/cloudai/schema/test_template/slurm_container/template.py similarity index 100% rename from src/cloudai/schema/test_template/generic_slurm_container/template.py rename to src/cloudai/schema/test_template/slurm_container/template.py diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index ee8a463a..910db56c 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -195,11 +195,11 @@ def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str: def _gen_srun_command( self, slurm_args: Dict[str, Any], env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun ) -> str: - srun_command_parts = self.gen_srun_prefix(slurm_args) + srun_command_parts = self.gen_srun_prefix(slurm_args, tr) test_command_parts = self.generate_test_command(env_vars, cmd_args, tr) return " ".join(srun_command_parts + test_command_parts) - def gen_srun_prefix(self, slurm_args: Dict[str, Any]) -> List[str]: + def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]: srun_command_parts = ["srun", f"--mpi={self.system.mpi}"] if slurm_args.get("image_path"): srun_command_parts.append(f'--container-image={slurm_args["image_path"]}') diff --git a/tests/test_init.py b/tests/test_init.py index 5ca6c5f2..07ca9268 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -135,7 +135,7 @@ def test_test_templates(): assert test_templates["NeMoLauncher"] == NeMoLauncher assert test_templates["Sleep"] == Sleep assert 
test_templates["UCCTest"] == UCCTest - assert test_templates["GenericSlurmContainer"] == SlurmContainer + assert test_templates["SlurmContainer"] == SlurmContainer def test_installers(): @@ -153,7 +153,7 @@ def test_definitions(): assert test_defs["ChakraReplay"] == ChakraReplayTestDefinition assert test_defs["Sleep"] == SleepTestDefinition assert test_defs["NeMoLauncher"] == NeMoLauncherTestDefinition - assert test_defs["GenericSlurmContainer"] == SlurmContainerTestDefinition + assert test_defs["SlurmContainer"] == SlurmContainerTestDefinition def test_definitions_matches_templates(): From d57d76e897005b859b56f30b7d28023506906b16 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 13:28:24 +0100 Subject: [PATCH 15/21] Add acceptance test for slurm_container --- .../slurm_command_gen_strategy.py | 2 +- tests/test_acceptance.py | 33 ++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 162febdf..7b4fe63b 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -33,7 +33,7 @@ def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().gen_srun_prefix(slurm_args, tr) - return cmd + ["--no-container-mount-home "] + return cmd + ["--no-container-mount-home"] def generate_test_command(self, env_vars: dict[str, str], cmd_args: dict[str, str], tr: TestRun) -> list[str]: srun_command_parts: list[str] = [] diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index d1e57782..38bbb39a 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -29,12 +29,15 @@ from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy from cloudai.schema.test_template.sleep.template import Sleep +from cloudai.schema.test_template.slurm_container.slurm_command_gen_strategy import SlurmContainerCommandGenStrategy +from cloudai.schema.test_template.slurm_container.template import SlurmContainer from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy from cloudai.systems import SlurmSystem from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition +from cloudai.test_definitions.slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition SLURM_TEST_SCENARIOS = [ @@ -91,7 +94,9 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]: return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path) -@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"]) +@pytest.fixture( + params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "slurm_container"] +) def test_req(request, 
slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]: if request.param == "ucc": tr = partial_tr( @@ -211,6 +216,32 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr]) return (tr, f"{request.param}.sbatch", "grok.run") + elif request.param == "slurm_container": + tr = partial_tr( + name="slurm_container", + test=Test( + test_definition=SlurmContainerTestDefinition( + name="slurm_container", + description="slurm_container", + test_template_name="slurm_container", + cmd_args=SlurmContainerCmdArgs( + docker_image_url="https://docker/url", + repository_url="https://repo/url", + repository_commit_hash="commit_hash", + mcore_vfm_repo="https://mcore_vfm/repo", + mcore_vfm_commit_hash="mcore_vfm_commit_hash", + ), + extra_cmd_args={"bash": '-c "pwd ; ls"'}, + ), + test_template=SlurmContainer(slurm_system, name="slurm_container"), + ), + ) + tr.test.test_template.command_gen_strategy = SlurmContainerCommandGenStrategy( + slurm_system, tr.test.test_definition.cmd_args_dict + ) + tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name") + + return (tr, "slurm_container.sbatch", None) raise ValueError(f"Unknown test: {request.param}") From 5d1b81c87e14e99d31289b2c571e780f7bd473da Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:11:59 +0100 Subject: [PATCH 16/21] Add missing ref file --- tests/ref_data/slurm_container.sbatch | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 tests/ref_data/slurm_container.sbatch diff --git a/tests/ref_data/slurm_container.sbatch b/tests/ref_data/slurm_container.sbatch new file mode 100644 index 00000000..b959d148 --- /dev/null +++ b/tests/ref_data/slurm_container.sbatch @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=__JOB_NAME__ +#SBATCH -N 1 +#SBATCH --output=__OUTPUT_DIR__/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --partition=main + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + + +srun --mpi=pmix --container-image=https://docker/url --container-mounts=/Users/andreyma/workspace/nvidia/cloudai:/work,/Users/andreyma/workspace/nvidia/cloudai:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" \ No newline at end of file From d83c8d386b194b17c6de19c31f8c1e6d08dd85a9 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:25:24 +0100 Subject: [PATCH 17/21] Use system-wide root for __OUTPUT_DIR__ template --- .../slurm_container/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/gpt-no-hook.sbatch | 6 +++--- tests/ref_data/gpt-pre-test.sbatch | 10 +++++----- tests/ref_data/grok-no-hook.sbatch | 6 +++--- tests/ref_data/grok-pre-test.sbatch | 10 +++++----- tests/ref_data/nccl.sbatch | 4 ++-- tests/ref_data/sleep.sbatch | 4 ++-- tests/ref_data/slurm_container.sbatch | 6 +++--- tests/ref_data/ucc.sbatch | 4 ++-- tests/test_acceptance.py | 2 +- 10 files changed, 30 insertions(+), 28 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 7b4fe63b..5863d06f 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -28,8 +28,10 @@ class 
SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - repo_path = tdef.git_repo.installed_path or Path.cwd() - mcore_vfm_path = tdef.mcore_vfm_git_repo.installed_path or Path.cwd() + repo_path = tdef.git_repo.installed_path or self.system.install_path / tdef.git_repo.repo_name + mcore_vfm_path = ( + tdef.mcore_vfm_git_repo.installed_path or self.system.install_path / tdef.mcore_vfm_git_repo.repo_name + ) slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" cmd = super().gen_srun_prefix(slurm_args, tr) diff --git a/tests/ref_data/gpt-no-hook.sbatch b/tests/ref_data/gpt-no-hook.sbatch index f01e9222..77999bda 100644 --- a/tests/ref_data/gpt-no-hook.sbatch +++ b/tests/ref_data/gpt-no-hook.sbatch @@ -15,8 +15,8 @@ echo "Loading container with srun command" --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + --container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/gpt-pre-test.sbatch b/tests/ref_data/gpt-pre-test.sbatch index c0f6114f..d21f0ed7 100644 --- a/tests/ref_data/gpt-pre-test.sbatch +++ b/tests/ref_data/gpt-pre-test.sbatch @@ -8,8 +8,8 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD" -srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0) PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" @@ -19,9 +19,9 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + 
--container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh fi diff --git a/tests/ref_data/grok-no-hook.sbatch b/tests/ref_data/grok-no-hook.sbatch index 7e7adfc2..8d008611 100644 --- a/tests/ref_data/grok-no-hook.sbatch +++ b/tests/ref_data/grok-no-hook.sbatch @@ -15,8 +15,8 @@ echo "Loading container with srun command" --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + --container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh diff --git a/tests/ref_data/grok-pre-test.sbatch b/tests/ref_data/grok-pre-test.sbatch index 51730bd7..7d88745a 100644 --- a/tests/ref_data/grok-pre-test.sbatch +++ b/tests/ref_data/grok-pre-test.sbatch @@ -8,8 +8,8 @@ export COMBINE_THRESHOLD=1 export PER_GPU_COMBINE_THRESHOLD=0 export XLA_FLAGS="--xla_disable_hlo_passes=rematerialization --xla_dump_hlo_pass_re=.* --xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_gpu_enable_triton_gemm=false --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_graph_level=0 --xla_gpu_pgle_profile_file_or_directory_path=/opt/paxml/workspace/pgle_output_profile.pbtxt --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD --xla_gpu_run_post_layout_collective_pipeliner=false --xla_gpu_use_memcpy_local_p2p=false" -srun --output=__OUTPUT_DIR__/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 -SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/pre_test/nccl/stdout.txt && echo 1 || echo 0) +srun --output=__OUTPUT_DIR__/output/pre_test/nccl/stdout.txt --error=__OUTPUT_DIR__/output/pre_test/nccl/stderr.txt --mpi=pmix --container-image=nvcr.io/nvidia/pytorch:24.02-py3 /usr/local/bin/all_reduce_perf_mpi --nthreads 1 --ngpus 1 --minbytes 32M --maxbytes 32M --stepbytes 1M --op sum --datatype float --root 0 --iters 20 --warmup_iters 5 --agg_iters 1 --average 1 --parallel_init 0 --check 1 --blocking 0 --cudagraph 0 +SUCCESS_0=$(grep -q "Avg bus bandwidth" __OUTPUT_DIR__/output/pre_test/nccl/stdout.txt && echo 1 || echo 0) PRE_TEST_SUCCESS=$( [ $SUCCESS_0 -eq 1 ] && echo 1 || echo 0 ) if [ $PRE_TEST_SUCCESS -eq 1 ]; then echo "Loading container with srun command" @@ -19,9 +19,9 @@ if [ $PRE_TEST_SUCCESS -eq 1 ]; then --mpi=none \ \ --export=ALL \ - -o __OUTPUT_DIR__/output-%j-%n-%t.txt \ - -e __OUTPUT_DIR__/error-%j-%n-%t.txt \ + -o __OUTPUT_DIR__/output/output-%j-%n-%t.txt \ + -e __OUTPUT_DIR__/output/error-%j-%n-%t.txt \ --container-name=cont \ - --container-mounts=__OUTPUT_DIR__:/opt/paxml/workspace/ \ + 
--container-mounts=__OUTPUT_DIR__/output:/opt/paxml/workspace/ \ /opt/paxml/workspace/run.sh fi diff --git a/tests/ref_data/nccl.sbatch b/tests/ref_data/nccl.sbatch index dc179ba9..2a9f57b4 100644 --- a/tests/ref_data/nccl.sbatch +++ b/tests/ref_data/nccl.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/ref_data/sleep.sbatch b/tests/ref_data/sleep.sbatch index 9262001b..1ce9ca32 100644 --- a/tests/ref_data/sleep.sbatch +++ b/tests/ref_data/sleep.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/ref_data/slurm_container.sbatch b/tests/ref_data/slurm_container.sbatch index b959d148..6479402b 100644 --- a/tests/ref_data/slurm_container.sbatch +++ b/tests/ref_data/slurm_container.sbatch @@ -1,11 +1,11 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun --mpi=pmix --container-image=https://docker/url --container-mounts=/Users/andreyma/workspace/nvidia/cloudai:/work,/Users/andreyma/workspace/nvidia/cloudai:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" \ No newline at end of file +srun --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/install/url__commit_hash:/work,__OUTPUT_DIR__/install/repo__mcore_vfm_commit_hash:/opt/megatron-lm --no-container-mount-home bash -c "pwd ; ls" diff --git a/tests/ref_data/ucc.sbatch b/tests/ref_data/ucc.sbatch index a9f9e686..a3f5fca8 100644 --- a/tests/ref_data/ucc.sbatch +++ b/tests/ref_data/ucc.sbatch @@ -1,8 +1,8 @@ #!/bin/bash #SBATCH --job-name=__JOB_NAME__ #SBATCH -N 1 -#SBATCH --output=__OUTPUT_DIR__/stdout.txt -#SBATCH --error=__OUTPUT_DIR__/stderr.txt +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt #SBATCH --partition=main export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 38bbb39a..0711831b 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -255,7 +255,7 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s curr = Path(sbatch_script).read_text().strip() ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip() - ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name") + ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent)).replace("__JOB_NAME__", "job_name") assert curr == ref From c634ad1365b66aed4bea9a7d1fa9c4fba5d26199 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:28:09 +0100 Subject: [PATCH 
18/21] Make ruff happy --- .../test_template/slurm_container/slurm_command_gen_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 5863d06f..6763debc 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pathlib import Path from typing import Any, cast from cloudai import TestRun From 0c8a909e2085ab2d0f4aebefb71607c5d515496d Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 11 Nov 2024 15:47:40 +0100 Subject: [PATCH 19/21] Make container mounts more configurable --- .../slurm_container/slurm_command_gen_strategy.py | 6 +----- src/cloudai/test_definitions/slurm_container.py | 9 +++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py index 6763debc..23a22958 100644 --- a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py @@ -27,11 +27,7 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy): def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]: tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition) slurm_args["image_path"] = tdef.docker_image.installed_path - repo_path = tdef.git_repo.installed_path or self.system.install_path / tdef.git_repo.repo_name - mcore_vfm_path = ( - tdef.mcore_vfm_git_repo.installed_path or self.system.install_path / tdef.mcore_vfm_git_repo.repo_name - ) - slurm_args["container_mounts"] = f"{repo_path.absolute()}:/work,{mcore_vfm_path.absolute()}:/opt/megatron-lm" + slurm_args["container_mounts"] = ",".join(tdef.container_mounts(self.system.install_path)) cmd = super().gen_srun_prefix(slurm_args, tr) return cmd + ["--no-container-mount-home"] diff --git a/src/cloudai/test_definitions/slurm_container.py b/src/cloudai/test_definitions/slurm_container.py index f4ca6a2b..e84d3b47 100644 --- a/src/cloudai/test_definitions/slurm_container.py +++ b/src/cloudai/test_definitions/slurm_container.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path from typing import Optional from cloudai import CmdArgs, Installable, TestDefinition @@ -63,6 +64,14 @@ def mcore_vfm_git_repo(self) -> GitRepo: return self._mcore_git_repo + def container_mounts(self, root: Path) -> list[str]: + repo_path = self.git_repo.installed_path or root / self.git_repo.repo_name + mcore_vfm_path = self.mcore_vfm_git_repo.installed_path or root / self.mcore_vfm_git_repo.repo_name + return [ + f"{repo_path.absolute()}:/work", + f"{mcore_vfm_path.absolute()}:/opt/megatron-lm", + ] + @property def installables(self) -> list[Installable]: return [self.docker_image, self.git_repo, self.mcore_vfm_git_repo] From 63973e356bf560e291a4ea9f0145e11928506657 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 13 Nov 2024 12:07:23 +0100 Subject: [PATCH 20/21] Remove config examples --- conf/new/test/nemo-vfm-mock_dit7b_65k.toml | 36 ---------------- conf/new/test/nemo-vfm-mock_dit7b_8k.toml | 36 ---------------- .../test/nemo-vfm-mock_ditllama28b_65k.toml | 36 ---------------- .../test/nemo-vfm-mock_ditllama28b_8k.toml | 36 ---------------- conf/new/test_scenario/nemo-vfm.toml | 41 ------------------- 5 files changed, 185 deletions(-) delete mode 100644 conf/new/test/nemo-vfm-mock_dit7b_65k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_dit7b_8k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml delete mode 100644 conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml delete mode 100644 conf/new/test_scenario/nemo-vfm.toml diff --git a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml b/conf/new/test/nemo-vfm-mock_dit7b_65k.toml deleted file mode 100644 index e8fe485f..00000000 --- a/conf/new/test/nemo-vfm-mock_dit7b_65k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_dit7b_65k" -description = "Nemo VFM factory=mock_dit7b_65k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml b/conf/new/test/nemo-vfm-mock_dit7b_8k.toml deleted file mode 100644 index 1d7bcd5b..00000000 --- a/conf/new/test/nemo-vfm-mock_dit7b_8k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_dit7b_8k" -description = "Nemo VFM factory=mock_dit7b_8k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_dit7b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml deleted file mode 100644 index 6f78c14b..00000000 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_65k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_ditllama28b_65k" -description = "Nemo VFM factory=mock_ditllama28b_65k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_65k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml b/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml deleted file mode 100644 index 2083ad69..00000000 --- a/conf/new/test/nemo-vfm-mock_ditllama28b_8k.toml +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm-mock_ditllama28b_8k" -description = "Nemo VFM factory=mock_ditllama28b_8k" -test_template_name = "SlurmContainer" - -[cmd_args] -docker_image_url = "gitlab-master.nvidia.com/dl/nemo/nemo-vfm:24.10" -repository_url = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/nemo-vfm.git" -repository_commit_hash = "98d67aaf64fa74811483a30abf354ba75b0ba416" # benchmark branch -mcore_vfm_repo = "ssh://git@gitlab-master.nvidia.com:12051/dl/nemo/mcore-vfm.git" -mcore_vfm_commit_hash = "e14f0b211aec63c7e62385a686b455b6ce5c404d" # fsdp branch - -[extra_cmd_args] -"bash" = '-c "cd /work ; pwd ; ls ; git log -1 ; export PYTHONPATH=`pwd`:/opt/megatron-lm/:$PYTHONPATH ; python -u nemo/collections/diffusion/train.py --yes --factory mock_ditllama28b_8k trainer.strategy.tensor_model_parallel_size=1 trainer.strategy.sequence_parallel=False trainer.strategy.context_parallel_size=1 model.config.recompute_granularity=full model.config.recompute_method=uniform model.config.recompute_num_layers=1 data.micro_batch_size=1 data.global_batch_size=256 data.seq_length=65536 data.task_encoder.seq_length=65536 trainer.max_steps=10"' - -[extra_env_vars] -"WANDB_PROJECT" = "vfm" -"WANDB_RESUME" = "allow" -"NVTE_FUSED_ATTN" = "0" -"CUDA_DEVICE_MAX_CONNECTIONS" = "1" -"PYTORCH_CUDA_ALLOC_CONF" = "expandable_segments:True" diff --git a/conf/new/test_scenario/nemo-vfm.toml b/conf/new/test_scenario/nemo-vfm.toml deleted file mode 100644 index 1a695dbb..00000000 --- a/conf/new/test_scenario/nemo-vfm.toml +++ /dev/null @@ -1,41 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "nemo-vfm" - -[[Tests]] -id = "Tests.mock.dit7b_8k" -test_name = "nemo-vfm-mock_dit7b_8k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.dit7b_65k" -test_name = "nemo-vfm-mock_dit7b_65k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.ditllama28b_8k" -test_name = "nemo-vfm-mock_ditllama28b_8k" -num_nodes = 8 -time_limit = "01:00:00" - -[[Tests]] -id = "Tests.mock.ditllama28b_65k" -test_name = "nemo-vfm-mock_ditllama28b_65k" -num_nodes = 8 -time_limit = "01:00:00" From 82d3d24778fb538d88405e96ae0047efd8aa0e42 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Fri, 15 Nov 2024 15:04:36 -0500 Subject: [PATCH 21/21] Use absolute path for NeMo launcher repository --- .../nemo_launcher/slurm_command_gen_strategy.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py index 2ca4392d..61e608c6 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py @@ -47,7 +47,11 @@ def gen_exec_command(self, tr: TestRun) -> str: ) self.final_cmd_args["cluster.gpus_per_node"] = self.system.gpus_per_node or "null" - repo_path = tdef.python_executable.git_repo.installed_path + repo_path = ( + tdef.python_executable.git_repo.installed_path.absolute() + if tdef.python_executable.git_repo.installed_path is not None + else None + ) if not repo_path: logging.warning( f"Local clone of git repo {tdef.python_executable.git_repo} does not exist. "