Skip to content

Commit

Permalink
Execute latest version of NCCL tests
Browse files Browse the repository at this point in the history
Upgrade NCCL version, NCCL Benchmarks version and OFI NCCL version.
* NCCL_BENCHMARKS_VERSION from 2.10.0 to 2.13.8
* NCCL_VERSION from 2.7.8-1 to 2.19.4-1
* OFI_NCCL_VERSION from 1.1.1 to 1.7.4-aws

### Tests

Print test output.

Make the test assertion more robust by matching the result row with message size 1073741824 bytes
and element count 268435456, instead of relying on the row's position in the output.

Signed-off-by: Enrico Usai <[email protected]>
  • Loading branch information
enrico-usai committed Dec 19, 2023
1 parent d890512 commit e51d786
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 8 deletions.
23 changes: 21 additions & 2 deletions tests/integration-tests/tests/efa/test_efa.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,28 @@ def _test_nccl_benchmarks(remote_command_executor, test_datadir, mpi_module, sch
scheduler_commands.wait_job_completed(job_id)
scheduler_commands.assert_job_succeeded(job_id)

result = remote_command_executor.run_remote_command("cat /shared/nccl_tests.out")
logging.info(f"Test result is: {result}")

# Expected output with NCCL_BENCHMARKS_VERSION='2.10.0', NCCL_VERSION='2.7.8-1' and OFI_NCCL_VERSION='1.1.1':
# out-of-place in-place
# size count type redop time algbw busbw error time algbw busbw error
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
# ...
# 1073741824 268435456 float sum 79531 13.50 26.58 2e-06 79371 13.53 26.63 2e-06
#
# --------
# Expected output with NCCL_BENCHMARKS_VERSION='2.13.8', NCCL_VERSION='2.19.4-1' and OFI_NCCL_VERSION='1.7.4-aws':
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
# ...
# 1073741824 268435456 float sum -1 44023 24.39 45.73 0 43947 24.43 45.81 0

# We are looking for packet size 1073741824, 268435456 elements and in-place busbw (GB/s).
max_bandwidth = remote_command_executor.run_remote_command(
"cat /shared/nccl_tests.out | tail -4 | head -1 | awk '{print $11}'"
"cat /shared/nccl_tests.out | grep -E '1073741824\\s+268435456' | awk '{print $12}'"
).stdout

# Expected bandwidth with 2 nodes, 8 tasks per node is about 27GB/s
# Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 27GB/s
assert_that(float(max_bandwidth)).is_greater_than(26.0)
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ set -e
rm -rf /shared/${1}

module load ${1}
NCCL_BENCHMARKS_VERSION='2.10.0'
NCCL_VERSION='2.7.8-1'
OFI_NCCL_VERSION='1.1.1'
NCCL_BENCHMARKS_VERSION='2.13.8'
NCCL_VERSION='2.19.4-1'
OFI_NCCL_VERSION='1.7.4-aws'
MPI_HOME=$(which mpirun | awk -F '/bin' '{print $1}')
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80" # Arch for NVIDIA A100

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
#SBATCH --exclusive
#SBATCH --ntasks-per-node=8


module load openmpi
NCCL_VERSION='2.7.8-1'
NCCL_VERSION='2.19.4-1'
NCCL_BENCHMARKS_VERSION='2.13.8'

mpirun \
-x FI_PROVIDER="efa" \
-x FI_EFA_USE_DEVICE_RDMA=1 \
Expand All @@ -15,4 +16,4 @@ mpirun \
-x NCCL_DEBUG=WARNING \
-x NCCL_PROTO=simple \
--mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
/shared/openmpi/nccl-tests-2.10.0/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 > /shared/nccl_tests.out
/shared/openmpi/nccl-tests-${NCCL_BENCHMARKS_VERSION}/build/all_reduce_perf -b 8 -e 1G -f 2 -g 1 -c 1 -n 100 > /shared/nccl_tests.out

0 comments on commit e51d786

Please sign in to comment.