diff --git a/.github/workflows/functional_test.yml b/.github/workflows/functional_test.yml new file mode 100644 index 000000000..4b98f49ef --- /dev/null +++ b/.github/workflows/functional_test.yml @@ -0,0 +1,42 @@ +name: Functional Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - scenario_path: "conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml" + expected_output_path: "ci_tools/functional_tests/scenarios_expected_outputs/nccl_test" + # Add your new test here + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' + cache-dependency-path: | + **/requirements*.txt + + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + pip install -r requirements.txt + pip install . + + + - name: Run tests + run: | + # Run the test inside the condition: the step shell is "bash -e", so a separate "$?" check after a failing command is never reached. + if ! bash ci_tools/functional_tests/run_functional_test.sh \ + "${{ matrix.scenario_path }}" "${{ matrix.expected_output_path }}"; then + echo "Test ${{ matrix.scenario_path }} failed" + exit 1 + fi diff --git a/ci_tools/functional_tests/README.md b/ci_tools/functional_tests/README.md new file mode 100644 index 000000000..09b48c526 --- /dev/null +++ b/ci_tools/functional_tests/README.md @@ -0,0 +1,20 @@ +The functional tests use the dry-run mode and compare the output with an expected output. + +### The test +The test receives two parameters: +- The path to the scenario that needs to be executed +- The path to the expected output directory; this directory should be the same as the result directory written by the dry-run execution of the scenario + +It executes the scenario in dry-run mode and compares the results folder with the expected output folder; if any difference is found in one of the files, the test fails. 
+**Note that the empty lines and commented lines (the ones that start with #) are not taken into account when computing the difference.** + +### The workflow +The workflow uses a strategy matrix to define the tests it should run; all tests run in parallel. +Note that the pip cache is keyed on the `requirements*.txt` files, so dependencies are downloaded again only when one of those files changes. + +### Add a new test +To add a new test: +- Create the desired scenario +- Create a folder under `ci_tools/functional_tests/scenarios_expected_outputs/` named after the name of this test and fill it with the expected output of the scenario (you can copy the output of a valid dry run) +- In the pipeline file (`.github/workflows/functional_test.yml`), add your new test in `jobs.test.strategy.matrix.include` (see the comment saying "Add your new test here"), following the syntax of the other tests + diff --git a/ci_tools/functional_tests/run_functional_test.sh b/ci_tools/functional_tests/run_functional_test.sh new file mode 100644 index 000000000..6c0a4525a --- /dev/null +++ b/ci_tools/functional_tests/run_functional_test.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +VERBOSE=true + +if [ $# -ne 2 ]; then + echo "Usage: $0 <test_scenario_path> <expected_output_path>" + echo "test_scenario_path: the path of the test scenario" + echo "expected_output_path: the path of the directory containing the expected result of the test" + echo "Example: $0 conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/" + exit 1 +fi + +files_diff() { + local file1="$1" + local file2="$2" + # Ignore: + # - Empty lines + # - Commented lines (start with #) + diff <(grep -v '^\s*#' <(grep -v '^[[:space:]]*$' "$file1")) <(grep -v '^\s*#' <(grep -v '^[[:space:]]*$' "$file2")) > /dev/null +} + +# Recursively compare directory $1 (expected) against $2 (actual) using files_diff; returns 1 on a file-count mismatch or on any missing or differing file. +dirs_diff() { + local error=false + local dir1="${1%/}" # drop a trailing slash so the prefix swap below yields a valid path + local dir2="${2%/}" + + local files1=$(find "$dir1" -type f) + local files2=$(find "$dir2" -type f) + + if [ "$(echo "$files1" | 
wc -l)" -ne "$(echo "$files2" | wc -l)" ]; then + >&2 echo "Directories have different count of files." + error=true + fi + + while IFS= read -r file1; do + local file2="${dir2}${file1#"$dir1"}" # literal prefix swap; no regex interpretation of the path + if [ ! -f "$file2" ]; then + >&2 echo "File $file2 does not exist in $dir2." + error=true + fi + + if ! files_diff "$file1" "$file2"; then + >&2 echo "Files $file1 and $file2 have different contents." + error=true + fi + done <<< "$files1" + + if $error; then + return 1 + fi +} + + +scenario_path="$1" +expected_output_path="$2" + +if [ ! -f "$scenario_path" ]; then + >&2 echo "Error: Scenario is not valid, can't find path $scenario_path." + exit 1 +fi + +mkdir -p results + +last_result_before=$(find results -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | LC_ALL=C sort | tail -n 1) # result directories are timestamp-named, so the last name in sorted order is the newest run + +python main.py \ + --mode dry-run \ + --system_config_path "ci_tools/functional_tests/system_config.toml" \ + --test_scenario_path "$scenario_path" + +last_result=$(find results -mindepth 1 -maxdepth 1 -type d -printf '%f\n' | LC_ALL=C sort | tail -n 1) + +if [ "$last_result_before" == "$last_result" ]; then + >&2 echo "No new result added after running cloudai dry run." + exit 1 +fi + +last_result_path="results/$last_result" + +dirs_diff "$expected_output_path" "$last_result_path" +is_diff=$? + +if [ $is_diff -eq 1 ]; then + >&2 echo "Result output is not as expected." 
+ exit 1 +fi + +$VERBOSE && echo "Test ran successfully" + +exit 0 diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh new file mode 100644 index 000000000..5c81df247 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=all_reduce_perf_mpi_20240515_120240 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_reduce_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 16G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh new file mode 100644 index 000000000..0b095811b --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH 
--job-name=bisection_perf_mpi_20240515_120248 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/bisection_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh new file mode 100644 index 000000000..63cb23975 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=all_reduce_perf_mpi_20240515_120249 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export 
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_reduce_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 16G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh new file mode 100644 index 000000000..a84611320 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=all_gather_perf_mpi_20240515_120250 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_gather_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ 
+--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh new file mode 100644 index 000000000..756b8dd05 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120251 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/reduce_scatter_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.14/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.14/0/sbatch_script.sh new file mode 100644 index 000000000..7ee1980db --- /dev/null +++ 
b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.14/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=alltoall_perf_mpi_20240515_120252 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.14/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.14/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/alltoall_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.15/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.15/0/sbatch_script.sh new file mode 100644 index 000000000..a315d6eaa --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.15/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=bisection_perf_mpi_20240515_120253 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.15/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.15/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 
+#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/bisection_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.16/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.16/0/sbatch_script.sh new file mode 100644 index 000000000..740b0a3fb --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.16/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=all_reduce_perf_mpi_20240515_120254 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.16/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.16/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_reduce_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 16G \ 
+--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.17/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.17/0/sbatch_script.sh new file mode 100644 index 000000000..d7846d3b7 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.17/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=all_gather_perf_mpi_20240515_120255 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.17/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.17/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_gather_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.18/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.18/0/sbatch_script.sh new file mode 100644 index 
000000000..2b58d0fb9 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.18/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120256 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.18/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.18/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/reduce_scatter_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.19/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.19/0/sbatch_script.sh new file mode 100644 index 000000000..b0fcb21bf --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.19/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=alltoall_perf_mpi_20240515_120257 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.19/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.19/0/stderr.txt +#SBATCH 
--partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/alltoall_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.2/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.2/0/sbatch_script.sh new file mode 100644 index 000000000..a4ccd6e54 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.2/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=all_gather_perf_mpi_20240515_120240 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.2/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.2/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ 
+/usr/local/bin/all_gather_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.20/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.20/0/sbatch_script.sh new file mode 100644 index 000000000..85a9b2679 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.20/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=bisection_perf_mpi_20240515_120259 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.20/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.20/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/bisection_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.21/0/sbatch_script.sh 
b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.21/0/sbatch_script.sh new file mode 100644 index 000000000..2d41ad97c --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.21/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=all_reduce_perf_mpi_20240515_120300 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.21/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.21/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_reduce_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 16G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.22/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.22/0/sbatch_script.sh new file mode 100644 index 000000000..1dc3faf84 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.22/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=all_gather_perf_mpi_20240515_120301 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.22/0/stdout.txt +#SBATCH 
--error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.22/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_gather_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.23/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.23/0/sbatch_script.sh new file mode 100644 index 000000000..f901456bf --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.23/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120302 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.23/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.23/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export 
NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/reduce_scatter_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.24/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.24/0/sbatch_script.sh new file mode 100644 index 000000000..7fb6b04a4 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.24/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=alltoall_perf_mpi_20240515_120303 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.24/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.24/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/alltoall_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No 
newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.25/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.25/0/sbatch_script.sh new file mode 100644 index 000000000..e29b817fa --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.25/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=bisection_perf_mpi_20240515_120304 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.25/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.25/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/bisection_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.3/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.3/0/sbatch_script.sh new file mode 100644 index 000000000..1e8eaea95 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.3/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120241 +#SBATCH -N 2 +#SBATCH 
--output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.3/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.3/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/reduce_scatter_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.4/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.4/0/sbatch_script.sh new file mode 100644 index 000000000..7d3b70e39 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.4/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=alltoall_perf_mpi_20240515_120242 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.4/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.4/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export 
MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/alltoall_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.5/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.5/0/sbatch_script.sh new file mode 100644 index 000000000..f8e5b0114 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.5/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=bisection_perf_mpi_20240515_120243 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.5/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.5/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/bisection_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ 
+--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.6/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.6/0/sbatch_script.sh new file mode 100644 index 000000000..2e3027235 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.6/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=all_reduce_perf_mpi_20240515_120244 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.6/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.6/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_reduce_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 16G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.7/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.7/0/sbatch_script.sh new file mode 100644 index 000000000..378bc66fb --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.7/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=all_gather_perf_mpi_20240515_120245 
+#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.7/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.7/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/all_gather_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.8/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.8/0/sbatch_script.sh new file mode 100644 index 000000000..2fd85769f --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.8/0/sbatch_script.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120246 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.8/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.8/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export 
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 +export NCCL_TEST_SPLIT_MASK=0x7 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/reduce_scatter_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ +--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.9/0/sbatch_script.sh b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.9/0/sbatch_script.sh new file mode 100644 index 000000000..b91234119 --- /dev/null +++ b/ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.9/0/sbatch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#SBATCH --job-name=alltoall_perf_mpi_20240515_120247 +#SBATCH -N 2 +#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.9/0/stdout.txt +#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.9/0/stderr.txt +#SBATCH --partition=partition_1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --time=00:20:00 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 +export NCCL_IB_GID_INDEX=3 +export NCCL_IB_QPS_PER_CONNECTION=4 +export NCCL_IB_TIMEOUT=20 + +srun \ +--mpi=pmix \ +--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ +/usr/local/bin/alltoall_perf_mpi \ +--nthreads 1 \ +--ngpus 1 \ +--minbytes 128 \ +--maxbytes 4G \ +--stepbytes 1M \ +--op sum \ +--datatype float \ +--root 0 \ +--iters 100 \ +--warmup_iters 50 \ 
+--agg_iters 1 \ +--average 1 \ +--parallel_init 0 \ +--check 1 \ +--blocking 0 \ +--cudagraph 0 \ +--stepfactor 2 \ No newline at end of file diff --git a/ci_tools/functional_tests/system_config.toml b/ci_tools/functional_tests/system_config.toml new file mode 100644 index 000000000..3e17f9ef6 --- /dev/null +++ b/ci_tools/functional_tests/system_config.toml @@ -0,0 +1,45 @@ +name = "example-cluster" +scheduler = "slurm" + +install_path = "/path/to/install" +output_path = "./results" +default_partition = "partition_1" + +gpus_per_node = 8 +ntasks_per_node = 8 + +[partitions] + [partitions.partition_1] + name = "partition_1" + nodes = ["node-[001-100]"] + + [partitions.partition_2] + name = "partition_2" + nodes = ["node-[101-200]"] + + [partitions.partition_1.groups] + [partitions.partition_1.groups.group_1] + name = "group_1" + nodes = ["node-[001-025]"] + + [partitions.partition_1.groups.group_2] + name = "group_2" + nodes = ["node-[026-050]"] + + [partitions.partition_1.groups.group_3] + name = "group_3" + nodes = ["node-[051-075]"] + + [partitions.partition_1.groups.group_4] + name = "group_4" + nodes = ["node-[076-100]"] + +[global_env_vars] + # NCCL Specific Configurations + NCCL_IB_GID_INDEX = "3" + NCCL_IB_TIMEOUT = "20" + NCCL_IB_QPS_PER_CONNECTION = "4" + + # Device Visibility Configuration + MELLANOX_VISIBLE_DEVICES = "0,3,4,5,6,9,10,11" + CUDA_VISIBLE_DEVICES = "0,1,2,3,4,5,6,7"