Add infrastructure for functional tests in CI pipeline and NCCL test …

…example
NVIDIA · May 15, 2024 · 9fe15c1 · 9fe15c1
1 parent eb50e83
commit 9fe15c1
Show file tree

Hide file tree

Showing 29 changed files with 1,182 additions and 0 deletions.
diff --git a/.github/workflows/functional_test.yml b/.github/workflows/functional_test.yml
@@ -0,0 +1,41 @@
+# Runs every functional-test scenario listed in the matrix on each push and
+# pull request. Each matrix entry is executed as its own job.
+name: Functional Tests
+
+on: [push, pull_request]
+
+jobs:
+    test:
+        runs-on: ubuntu-latest
+        strategy:
+            matrix:
+                include:
+                    - scenario_path: "conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml"
+                      expected_output_path: "ci_tools/functional_tests/scenarios_expected_outputs/nccl_test"
+                    # Add your new test here
+
+        steps:
+            - name: Checkout code
+              uses: actions/checkout@v4
+
+            - name: Set up Python
+              uses: actions/setup-python@v5
+              with:
+                  # Quoted so YAML keeps the version a string rather than a float.
+                  python-version: "3.9"
+                  cache: 'pip'
+                  # The pip cache key is derived from every requirements*.txt file.
+                  cache-dependency-path: |
+                      **/requirements*.txt
+
+            - name: Install dependencies
+              run: |
+                  pip install -r requirements-dev.txt
+                  pip install -r requirements.txt
+                  pip install .
+
+            - name: Run tests
+              # The script is tested directly in the `if` so the failure message is
+              # still printed when the step's shell runs with errexit enabled.
+              run: |
+                  if ! bash ci_tools/functional_tests/run_functional_test.sh "${{ matrix.scenario_path }}" "${{ matrix.expected_output_path }}"; then
+                      echo "Test ${{ matrix.scenario_path }} failed"
+                      exit 1
+                  fi
diff --git a/ci_tools/functional_tests/README.md b/ci_tools/functional_tests/README.md
@@ -0,0 +1,20 @@
+The functional tests run a scenario in dry-run mode and compare its output with an expected output.
+
+### The test
+The test receives two parameters:
+- The path to the scenario that needs to be executed
+- The path to the expected output directory, this directory should be the same as the result directory written by the dry-run execution of the scenario
+
+It executes the scenario in dry-run mode and compares the results folder with the expected output folder; if a difference is found in any of the files, the test fails.
+**Note that the empty lines and commented lines (the ones that start with #) are not taken into account when computing the difference.**
+
+### The workflow
+The workflow uses a strategy matrix to define the tests it should run; all tests run in parallel.
+Note that the pip cache of the Python setup is keyed on the `requirements*.txt` files (including `requirements-dev.txt`), so it is rebuilt only when one of those files changes.
+
+### Add a new test
+To add a new test:
+- Create the desired scenario
+- Create a folder under `ci_tools/functional_tests/scenarios_expected_outputs/` named after the name of this test and fill it with the expected output of the scenario (you can copy the output of a valid dry run)
+- In the pipeline file (`.github/workflows/functional_test.yml`), add your new test in `jobs.test.strategy.matrix.include` (see the comment saying "Add your new test here"), follow the syntax of the other tests
+
diff --git a/ci_tools/functional_tests/run_functional_test.sh b/ci_tools/functional_tests/run_functional_test.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# Functional test driver: runs one scenario in dry-run mode and compares the
+# produced results directory with a stored expected output directory.
+
+# When "true", a success message is printed at the end of the run.
+VERBOSE=true
+
+# Exactly two positional arguments are required; otherwise show the usage
+# text and stop with status 1.
+if [ "$#" -ne 2 ]; then
+    printf '%s\n' \
+        "Usage: $0 <test_scenario_path> <expected_output_path>" \
+        "test_scenario_path: the path of the test scenario" \
+        "expected_output_path: the path of the directory containing the expected result of the test" \
+        "Example: $0 conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/"
+    exit 1
+fi
+
+files_diff() {
+    local file1="$1"
+    local file2="$2"
+    # Ignore:
+	# - Empty lines
+	# - Commented lines (start with #)
+    diff <(grep -v '^\s*#' <(grep -v '^[[:space:]]*$' "$file1")) <(grep -v '^\s*#' <(grep -v '^[[:space:]]*$' "$file2")) > /dev/null
+}
+
+# recursively compare directories using files_diff
+dirs_diff() {
+    local error=false
+    local dir1="$1"
+    local dir2="$2"
+
+    local files1=$(find "$dir1" -type f)
+    local files2=$(find "$dir2" -type f)
+
+    if [ "$(echo "$files1" | wc -l)" -ne "$(echo "$files2" | wc -l)" ]; then
+        >&2 echo "Directories have different count of files."
+        error=true
+    fi
+
+    while IFS= read -r file1; do
+        local file2=$(echo "$file1" | sed "s|^$dir1|$dir2|")
+        if [ ! -f "$file2" ]; then
+            >&2 echo "File $file2 does not exist in $dir2."
+			error=true
+        fi
+
+        if ! files_diff "$file1" "$file2"; then
+            >&2 echo "Files $file1 and $file2 have different contents."
+            error=true
+        fi
+    done <<< "$files1"
+
+    if $error; then
+        return 1
+    fi
+}
+
+
+# Main flow: validate the scenario path, run it in dry-run mode, locate the
+# results directory created by that run and compare it with the expected
+# output directory. Exits 0 on success and 1 on any failure.
+scenario_path="$1"
+expected_output_path="$2"
+
+if [ ! -f "$scenario_path" ]; then
+    >&2 echo "Error: Scenario is not valid, can't find path $scenario_path."
+    exit 1
+fi
+
+mkdir -p results
+
+# Print the name of the last directory under results/ in glob (sorted) order,
+# or nothing when there is none. Result directories are named after the run
+# timestamp, so the last name is the most recent run.
+latest_result() {
+    local entry
+    local latest=""
+    for entry in results/*/; do
+        # An unmatched glob stays literal; skip it.
+        [ -d "$entry" ] || continue
+        entry="${entry%/}"
+        latest="${entry##*/}"
+    done
+    printf '%s' "$latest"
+}
+
+last_result_before=$(latest_result)
+
+if ! python main.py \
+    --mode dry-run \
+    --system_config_path "ci_tools/functional_tests/system_config.toml" \
+    --test_scenario_path "$scenario_path"; then
+    >&2 echo "cloudai dry run exited with a non-zero status."
+    exit 1
+fi
+
+last_result=$(latest_result)
+
+# The run must have produced a results directory that was not there before.
+if [ -z "$last_result" ] || [ "$last_result_before" = "$last_result" ]; then
+    >&2 echo "No new result added after running cloudai dry run."
+    exit 1
+fi
+
+last_result_path="results/$last_result"
+
+# Any non-zero status from dirs_diff means the outputs do not match.
+if ! dirs_diff "$expected_output_path" "$last_result_path"; then
+    >&2 echo "Result output is not as expected."
+    exit 1
+fi
+
+if [ "$VERBOSE" = true ]; then
+    echo "Test ran successfully"
+fi
+
+exit 0
diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=all_reduce_perf_mpi_20240515_120240
+#SBATCH -N 2
+#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stdout.txt
+#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stderr.txt
+#SBATCH --partition=partition_1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_IB_TIMEOUT=20
+
+srun \
+--mpi=pmix \
+--container-image=/path/to/install/nccl-test/nccl_test.sqsh \
+/usr/local/bin/all_reduce_perf_mpi \
+--nthreads 1 \
+--ngpus 1 \
+--minbytes 128 \
+--maxbytes 16G \
+--stepbytes 1M \
+--op sum \
+--datatype float \
+--root 0 \
+--iters 100 \
+--warmup_iters 50 \
+--agg_iters 1 \
+--average 1 \
+--parallel_init 0 \
+--check 1 \
+--blocking 0 \
+--cudagraph 0 \
+--stepfactor 2
diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=bisection_perf_mpi_20240515_120248
+#SBATCH -N 2
+#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stdout.txt
+#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stderr.txt
+#SBATCH --partition=partition_1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_IB_TIMEOUT=20
+
+srun \
+--mpi=pmix \
+--container-image=/path/to/install/nccl-test/nccl_test.sqsh \
+/usr/local/bin/bisection_perf_mpi \
+--nthreads 1 \
+--ngpus 1 \
+--minbytes 128 \
+--maxbytes 4G \
+--stepbytes 1M \
+--op sum \
+--datatype float \
+--root 0 \
+--iters 100 \
+--warmup_iters 50 \
+--agg_iters 1 \
+--average 1 \
+--parallel_init 0 \
+--check 1 \
+--blocking 0 \
+--cudagraph 0 \
+--stepfactor 2
diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#SBATCH --job-name=all_reduce_perf_mpi_20240515_120249
+#SBATCH -N 2
+#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stdout.txt
+#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stderr.txt
+#SBATCH --partition=partition_1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_IB_TIMEOUT=20
+
+srun \
+--mpi=pmix \
+--container-image=/path/to/install/nccl-test/nccl_test.sqsh \
+/usr/local/bin/all_reduce_perf_mpi \
+--nthreads 1 \
+--ngpus 1 \
+--minbytes 128 \
+--maxbytes 16G \
+--stepbytes 1M \
+--op sum \
+--datatype float \
+--root 0 \
+--iters 100 \
+--warmup_iters 50 \
+--agg_iters 1 \
+--average 1 \
+--parallel_init 0 \
+--check 1 \
+--blocking 0 \
+--cudagraph 0 \
+--stepfactor 2
diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=all_gather_perf_mpi_20240515_120250
+#SBATCH -N 2
+#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stdout.txt
+#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stderr.txt
+#SBATCH --partition=partition_1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_IB_TIMEOUT=20
+export NCCL_TEST_SPLIT_MASK=0x7
+
+srun \
+--mpi=pmix \
+--container-image=/path/to/install/nccl-test/nccl_test.sqsh \
+/usr/local/bin/all_gather_perf_mpi \
+--nthreads 1 \
+--ngpus 1 \
+--minbytes 128 \
+--maxbytes 4G \
+--stepbytes 1M \
+--op sum \
+--datatype float \
+--root 0 \
+--iters 100 \
+--warmup_iters 50 \
+--agg_iters 1 \
+--average 1 \
+--parallel_init 0 \
+--check 1 \
+--blocking 0 \
+--cudagraph 0 \
+--stepfactor 2
diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120251
+#SBATCH -N 2
+#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stdout.txt
+#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stderr.txt
+#SBATCH --partition=partition_1
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=8
+#SBATCH --time=00:20:00
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11
+export NCCL_IB_GID_INDEX=3
+export NCCL_IB_QPS_PER_CONNECTION=4
+export NCCL_IB_TIMEOUT=20
+export NCCL_TEST_SPLIT_MASK=0x7
+
+srun \
+--mpi=pmix \
+--container-image=/path/to/install/nccl-test/nccl_test.sqsh \
+/usr/local/bin/reduce_scatter_perf_mpi \
+--nthreads 1 \
+--ngpus 1 \
+--minbytes 128 \
+--maxbytes 4G \
+--stepbytes 1M \
+--op sum \
+--datatype float \
+--root 0 \
+--iters 100 \
+--warmup_iters 50 \
+--agg_iters 1 \
+--average 1 \
+--parallel_init 0 \
+--check 1 \
+--blocking 0 \
+--cudagraph 0 \
+--stepfactor 2