-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add infrastructure for functional tests in CI pipeline and NCCL test …
…example
- Loading branch information
1 parent
eb50e83
commit 9fe15c1
Showing
29 changed files
with
1,182 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Functional-test workflow: runs every scenario listed in the matrix below on
# each push and pull request, one job per matrix entry.
name: Functional Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Each entry pairs a test scenario with the directory holding its
        # expected dry-run output.
        include:
          - scenario_path: "conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml"
            expected_output_path: "ci_tools/functional_tests/scenarios_expected_outputs/nccl_test"
          # Add your new test here

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string (an unquoted 3.10 would
          # be parsed as the number 3.1).
          python-version: '3.9'
          cache: 'pip'
          cache-dependency-path: |
            **/requirements*.txt

      - name: Install dependencies
        run: |
          pip install -r requirements-dev.txt
          pip install -r requirements.txt
          pip install .

      - name: Run tests
        # The default shell runs with `bash -e`, so a separate `$?` check after
        # the command would never be reached on failure; test the command
        # directly instead so the failure message is actually printed.
        run: |
          if ! bash ci_tools/functional_tests/run_functional_test.sh "${{ matrix.scenario_path }}" "${{ matrix.expected_output_path }}"; then
            echo "Test ${{ matrix.scenario_path }} failed"
            exit 1
          fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
The functional tests use the dry-run mode and compare the output with an expected output.
|
||
### The test | ||
The test receives two parameters: | ||
- The path to the scenario that needs to be executed
- The path to the expected output directory, this directory should be the same as the result directory written by the dry-run execution of the scenario | ||
|
||
It executes the scenario in dry-run mode and compares the results folder with the expected output folder. If any difference is found in one of the files, the test fails.
**Note that the empty lines and commented lines (the ones that start with #) are not taken into account when computing the difference.** | ||
|
||
### The workflow | ||
The workflow uses a strategy matrix to define the tests it should run; all tests run in parallel.
Note that the pip dependencies are cached by the Python setup step; the cache is keyed on the `requirements*.txt` files, so it is refreshed only when one of them changes.
|
||
### Add a new test | ||
To add a new test: | ||
- Create the desired scenario | ||
- Create a folder under `ci_tools/functional_tests/scenarios_expected_outputs/` named after the name of this test and fill it with the expected output of the scenario (you can copy the output of a valid dry run) | ||
- In the pipeline file (`.github/workflows/functional_test.yml`), add your new test in `jobs.test.strategy.matrix.include` (see the comment saying "Add your new test here"), follow the syntax of the other tests | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#!/bin/bash
#
# Runs a test scenario in dry-run mode and compares the generated results
# directory with a directory of expected outputs.
#
# Usage: run_functional_test.sh <test_scenario_path> <expected_output_path>
#
# Environment:
#   VERBOSE  set to "false", "0" or "no" to silence the final success message
#            (default: true).

# Normalise VERBOSE to exactly "true" or "false" so it is always safe to use
# as a command (`$VERBOSE && ...`) later in the script.
case "${VERBOSE:-true}" in
  false | 0 | no) VERBOSE=false ;;
  *) VERBOSE=true ;;
esac

# Exactly two positional arguments are required; a usage error is a
# diagnostic, so it goes to stderr like every other error in this script.
if [ $# -ne 2 ]; then
  {
    echo "Usage: $0 <test_scenario_path> <expected_output_path>"
    echo "test_scenario_path: the path of the test scenario"
    echo "expected_output_path: the path of the directory containing the expected result of the test"
    echo "Example: $0 conf/v0.6/general/test_scenario/nccl_test/test_scenario.toml ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/"
  } >&2
  exit 1
fi
|
||
#######################################
# Compare two text files, ignoring lines that do not matter for the test.
# Ignored lines:
#   - Empty or whitespace-only lines
#   - Commented lines (first non-blank character is #)
# Arguments:
#   $1 - first file
#   $2 - second file
# Outputs:
#   Nothing (the diff itself is discarded).
# Returns:
#   0 if the remaining lines are identical, 1 if they differ,
#   2 if either argument is not a readable regular file.
#######################################
files_diff() {
  local file1="$1"
  local file2="$2"
  # Single POSIX ERE matching both kinds of ignored line.
  local ignored='^[[:space:]]*(#|$)'

  # Without this guard a missing file is filtered to empty output and would
  # compare equal to any file that contains only comments or blank lines.
  if [[ ! -f "$file1" || ! -r "$file1" || ! -f "$file2" || ! -r "$file2" ]]; then
    return 2
  fi

  diff \
    <(grep -Ev -e "$ignored" -- "$file1") \
    <(grep -Ev -e "$ignored" -- "$file2") \
    > /dev/null
}
|
||
#######################################
# Recursively compare two directories using files_diff.
# Every regular file under one directory must exist at the same relative
# path under the other, and each pair must compare equal with files_diff.
# Arguments:
#   $1 - first directory
#   $2 - second directory
# Outputs:
#   One message per problem found, on stderr.
# Returns:
#   0 if the directories match, 1 otherwise.
#######################################
dirs_diff() {
  local error=false
  local dir1="$1"
  local dir2="$2"
  local rel
  local count1=0
  local count2=0

  if [[ ! -d "$dir1" ]]; then
    >&2 echo "Directory $dir1 does not exist."
    return 1
  fi
  if [[ ! -d "$dir2" ]]; then
    >&2 echo "Directory $dir2 does not exist."
    return 1
  fi

  # Paths are listed relative to each directory, so the mapping between the
  # two trees does not depend on trailing slashes or on characters in the
  # directory names that a sed/regex substitution would misinterpret.
  # NUL-delimited output keeps unusual file names intact, and an empty
  # directory simply yields zero iterations.
  while IFS= read -r -d '' rel; do
    rel="${rel#./}"
    count1=$((count1 + 1))

    if [ ! -f "$dir2/$rel" ]; then
      >&2 echo "File $dir2/$rel does not exist in $dir2."
      error=true
      # Nothing to compare against; skip the content check.
      continue
    fi

    if ! files_diff "$dir1/$rel" "$dir2/$rel"; then
      >&2 echo "Files $dir1/$rel and $dir2/$rel have different contents."
      error=true
    fi
  done < <(cd -- "$dir1" && find . -type f -print0)

  # Second pass: name the files that only exist in the second directory.
  while IFS= read -r -d '' rel; do
    rel="${rel#./}"
    count2=$((count2 + 1))

    if [ ! -f "$dir1/$rel" ]; then
      >&2 echo "File $dir1/$rel does not exist in $dir1."
      error=true
    fi
  done < <(cd -- "$dir2" && find . -type f -print0)

  if [ "$count1" -ne "$count2" ]; then
    >&2 echo "Directories have different count of files."
    error=true
  fi

  if $error; then
    return 1
  fi
  return 0
}
|
||
|
||
# Main flow: run the scenario in dry-run mode, locate the results directory
# that the run created, and compare it with the expected output.

scenario_path="$1"
expected_output_path="$2"
results_dir="results"

if [ ! -f "$scenario_path" ]; then
  >&2 echo "Error: Scenario is not valid, can't find path $scenario_path."
  exit 1
fi

# Print the immediate sub-directories of the results directory, one per line,
# in byte order so the two snapshots below can be compared with comm.
list_result_dirs() {
  find "$results_dir" -mindepth 1 -maxdepth 1 -type d | LC_ALL=C sort
}

mkdir -p "$results_dir"

# Snapshot the result directories before the run so that the directory
# created by this run can be identified afterwards.
results_before=$(list_result_dirs)

if ! python main.py \
  --mode dry-run \
  --system_config_path "ci_tools/functional_tests/system_config.toml" \
  --test_scenario_path "$scenario_path"; then
  >&2 echo "cloudai dry run exited with an error."
  exit 1
fi

results_after=$(list_result_dirs)

# Keep only the directories that appeared during the run; if several
# appeared, take the last one in sorted order.
last_result_path=$(
  LC_ALL=C comm -13 \
    <(printf '%s\n' "$results_before") \
    <(printf '%s\n' "$results_after") \
    | tail -n 1
)

if [ -z "$last_result_path" ]; then
  >&2 echo "No new result added after running cloudai dry run."
  exit 1
fi

# Any non-zero status from dirs_diff means the output does not match.
if ! dirs_diff "$expected_output_path" "$last_result_path"; then
  >&2 echo "Result output is not as expected."
  exit 1
fi

if [ "$VERBOSE" = "true" ]; then
  echo "Test ran successfully"
fi

exit 0
39 changes: 39 additions & 0 deletions
39
ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.1/0/sbatch_script.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=all_reduce_perf_mpi_20240515_120240 | ||
#SBATCH -N 2 | ||
#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stdout.txt | ||
#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.1/0/stderr.txt | ||
#SBATCH --partition=partition_1 | ||
#SBATCH --gpus-per-node=8 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --time=00:20:00 | ||
|
||
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) | ||
|
||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||
export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 | ||
export NCCL_IB_GID_INDEX=3 | ||
export NCCL_IB_QPS_PER_CONNECTION=4 | ||
export NCCL_IB_TIMEOUT=20 | ||
|
||
srun \ | ||
--mpi=pmix \ | ||
--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ | ||
/usr/local/bin/all_reduce_perf_mpi \ | ||
--nthreads 1 \ | ||
--ngpus 1 \ | ||
--minbytes 128 \ | ||
--maxbytes 16G \ | ||
--stepbytes 1M \ | ||
--op sum \ | ||
--datatype float \ | ||
--root 0 \ | ||
--iters 100 \ | ||
--warmup_iters 50 \ | ||
--agg_iters 1 \ | ||
--average 1 \ | ||
--parallel_init 0 \ | ||
--check 1 \ | ||
--blocking 0 \ | ||
--cudagraph 0 \ | ||
--stepfactor 2 |
39 changes: 39 additions & 0 deletions
39
ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.10/0/sbatch_script.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=bisection_perf_mpi_20240515_120248 | ||
#SBATCH -N 2 | ||
#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stdout.txt | ||
#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.10/0/stderr.txt | ||
#SBATCH --partition=partition_1 | ||
#SBATCH --gpus-per-node=8 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --time=00:20:00 | ||
|
||
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) | ||
|
||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||
export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 | ||
export NCCL_IB_GID_INDEX=3 | ||
export NCCL_IB_QPS_PER_CONNECTION=4 | ||
export NCCL_IB_TIMEOUT=20 | ||
|
||
srun \ | ||
--mpi=pmix \ | ||
--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ | ||
/usr/local/bin/bisection_perf_mpi \ | ||
--nthreads 1 \ | ||
--ngpus 1 \ | ||
--minbytes 128 \ | ||
--maxbytes 4G \ | ||
--stepbytes 1M \ | ||
--op sum \ | ||
--datatype float \ | ||
--root 0 \ | ||
--iters 100 \ | ||
--warmup_iters 50 \ | ||
--agg_iters 1 \ | ||
--average 1 \ | ||
--parallel_init 0 \ | ||
--check 1 \ | ||
--blocking 0 \ | ||
--cudagraph 0 \ | ||
--stepfactor 2 |
39 changes: 39 additions & 0 deletions
39
ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.11/0/sbatch_script.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=all_reduce_perf_mpi_20240515_120249 | ||
#SBATCH -N 2 | ||
#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stdout.txt | ||
#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.11/0/stderr.txt | ||
#SBATCH --partition=partition_1 | ||
#SBATCH --gpus-per-node=8 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --time=00:20:00 | ||
|
||
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) | ||
|
||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||
export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 | ||
export NCCL_IB_GID_INDEX=3 | ||
export NCCL_IB_QPS_PER_CONNECTION=4 | ||
export NCCL_IB_TIMEOUT=20 | ||
|
||
srun \ | ||
--mpi=pmix \ | ||
--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ | ||
/usr/local/bin/all_reduce_perf_mpi \ | ||
--nthreads 1 \ | ||
--ngpus 1 \ | ||
--minbytes 128 \ | ||
--maxbytes 16G \ | ||
--stepbytes 1M \ | ||
--op sum \ | ||
--datatype float \ | ||
--root 0 \ | ||
--iters 100 \ | ||
--warmup_iters 50 \ | ||
--agg_iters 1 \ | ||
--average 1 \ | ||
--parallel_init 0 \ | ||
--check 1 \ | ||
--blocking 0 \ | ||
--cudagraph 0 \ | ||
--stepfactor 2 |
40 changes: 40 additions & 0 deletions
40
ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.12/0/sbatch_script.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=all_gather_perf_mpi_20240515_120250 | ||
#SBATCH -N 2 | ||
#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stdout.txt | ||
#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.12/0/stderr.txt | ||
#SBATCH --partition=partition_1 | ||
#SBATCH --gpus-per-node=8 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --time=00:20:00 | ||
|
||
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) | ||
|
||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||
export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 | ||
export NCCL_IB_GID_INDEX=3 | ||
export NCCL_IB_QPS_PER_CONNECTION=4 | ||
export NCCL_IB_TIMEOUT=20 | ||
export NCCL_TEST_SPLIT_MASK=0x7 | ||
|
||
srun \ | ||
--mpi=pmix \ | ||
--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ | ||
/usr/local/bin/all_gather_perf_mpi \ | ||
--nthreads 1 \ | ||
--ngpus 1 \ | ||
--minbytes 128 \ | ||
--maxbytes 4G \ | ||
--stepbytes 1M \ | ||
--op sum \ | ||
--datatype float \ | ||
--root 0 \ | ||
--iters 100 \ | ||
--warmup_iters 50 \ | ||
--agg_iters 1 \ | ||
--average 1 \ | ||
--parallel_init 0 \ | ||
--check 1 \ | ||
--blocking 0 \ | ||
--cudagraph 0 \ | ||
--stepfactor 2 |
40 changes: 40 additions & 0 deletions
40
ci_tools/functional_tests/scenarios_expected_outputs/nccl_test/Tests.13/0/sbatch_script.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash | ||
#SBATCH --job-name=reduce_scatter_perf_mpi_20240515_120251 | ||
#SBATCH -N 2 | ||
#SBATCH --output=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stdout.txt | ||
#SBATCH --error=/.autodirect/mtrsysgwork/eshukrun/cloudai/results/2024-05-15_12-02-40/Tests.13/0/stderr.txt | ||
#SBATCH --partition=partition_1 | ||
#SBATCH --gpus-per-node=8 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --time=00:20:00 | ||
|
||
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) | ||
|
||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 | ||
export MELLANOX_VISIBLE_DEVICES=0,3,4,5,6,9,10,11 | ||
export NCCL_IB_GID_INDEX=3 | ||
export NCCL_IB_QPS_PER_CONNECTION=4 | ||
export NCCL_IB_TIMEOUT=20 | ||
export NCCL_TEST_SPLIT_MASK=0x7 | ||
|
||
srun \ | ||
--mpi=pmix \ | ||
--container-image=/path/to/install/nccl-test/nccl_test.sqsh \ | ||
/usr/local/bin/reduce_scatter_perf_mpi \ | ||
--nthreads 1 \ | ||
--ngpus 1 \ | ||
--minbytes 128 \ | ||
--maxbytes 4G \ | ||
--stepbytes 1M \ | ||
--op sum \ | ||
--datatype float \ | ||
--root 0 \ | ||
--iters 100 \ | ||
--warmup_iters 50 \ | ||
--agg_iters 1 \ | ||
--average 1 \ | ||
--parallel_init 0 \ | ||
--check 1 \ | ||
--blocking 0 \ | ||
--cudagraph 0 \ | ||
--stepfactor 2 |
Oops, something went wrong.