Merge branch 'main' into chhwang/code-coverage

microsoft · Mar 27, 2024 · 6fe900f
2 parents 9745fc3 + 5ba6ce0
commit 6fe900f
Show file tree

Hide file tree

Showing 140 changed files with 5,542 additions and 1,662 deletions.
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
@@ -13,9 +13,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
 
   pool:
     name: mscclpp
@@ -30,10 +30,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
         make -j
       workingDirectory: '$(System.DefaultWorkingDirectory)'
 
@@ -112,3 +110,15 @@ jobs:
         set -e
         python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
       workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+  - task: Bash@3
+    name: PythonAllReduceBenchmark
+    displayName: Python Allreduce Benchmark
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        python3 -m pip install .
+        mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
+      workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
@@ -10,9 +10,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
   pool:
     name: mscclpp-it
   container:
@@ -25,12 +25,9 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
+        cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
         make -j
-        make pylib-copy
       workingDirectory: '$(System.DefaultWorkingDirectory)'
 
   - task: DownloadSecureFile@1
@@ -83,7 +80,7 @@ jobs:
         tail -f output/mscclit-000000 &
         CHILD_PID=$!
         parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mscclpp-test'
+        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test'
         kill $CHILD_PID
 
   - task: Bash@3
@@ -102,7 +99,7 @@ jobs:
         tail -f output/mscclit-000000 &
         CHILD_PID=$!
         parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh mp-ut'
+        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh mp-ut'
         kill $CHILD_PID
 
   - task: Bash@3
@@ -121,7 +118,26 @@ jobs:
         tail -f output/mscclit-000000 &
         CHILD_PID=$!
         parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
-        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/run_tests.sh pytests'
+        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh pytests'
+        kill $CHILD_PID
+
+  - task: Bash@3
+    name: RunMultiNodePythonBenchmark
+    displayName: Run multi-nodes python benchmark
+    inputs:
+      targetType: 'inline'
+      script: |
+        set -e
+        HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
+        SSH_OPTION="StrictHostKeyChecking=no"
+        KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+        rm -rf output/*
+        mkdir -p output
+        touch output/mscclit-000000
+        tail -f output/mscclit-000000 &
+        CHILD_PID=$!
+        parallel-ssh -t 0 -H mscclit-000000 -l azureuser -x "-i ${KeyFilePath}" \
+        -O $SSH_OPTION -o output 'sudo docker exec -t mscclpp-test bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark'
         kill $CHILD_PID
 
   - task: AzureCLI@2

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
@@ -15,9 +15,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
 
   container:
     image: $[ variables['containerImage'] ]
@@ -30,10 +30,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake -DCMAKE_BUILD_TYPE=Release ..
         make -j
       workingDirectory: '$(System.DefaultWorkingDirectory)'
 
@@ -79,11 +77,5 @@ jobs:
       script: |
         set -e
         export PATH=/usr/local/mpi/bin:$PATH
-        cd build && make pylib-copy
-        if [[ '$(containerImage)' == *'cuda11'* ]]; then
-          pip3 install -r ../python/test/requirements_cu11.txt
-        else
-          pip3 install -r ../python/test/requirements_cu12.txt
-        fi
-        mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
+        mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x
       workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.github/ISSUE_TEMPLATE/documentation-improvement.md b/.github/ISSUE_TEMPLATE/documentation-improvement.md
@@ -0,0 +1,10 @@
+---
+name: Documentation improvement
+about: Enhance or fix documentation
+title: "[Doc]"
+labels: ''
+assignees: ''
+
+---
+
+
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
@@ -13,7 +13,7 @@ jobs:
     name: Analyze
     runs-on: 'ubuntu-latest'
     container:
-      image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
+      image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }}
 
     permissions:
       actions: read
@@ -24,7 +24,7 @@ jobs:
       fail-fast: false
       matrix:
         language: [ 'cpp', 'python' ]
-        cuda-version: [ 'cuda11.8', 'cuda12.1' ]
+        cuda-version: [ 'cuda11.8', 'cuda12.2' ]
 
     steps:
     - name: Checkout repository
@@ -45,7 +45,7 @@ jobs:
 
     - name: Build
       run: |
-        MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
+        cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
         make -j
 
     - name: Perform CodeQL Analysis

diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
@@ -10,10 +10,10 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        cuda: [ cuda11.8, cuda12.1 ]
+        cuda: [ cuda11.8, cuda12.2 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -23,7 +23,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
+          cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
 
       - name: Lock GPU clock frequency

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -20,7 +20,7 @@ jobs:
 
     - name: Run cpplint
       run: |
-        CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*")
+        CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)')
         clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}
 
   pylint:

diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml
@@ -11,10 +11,10 @@ jobs:
     timeout-minutes: 30
     strategy:
       matrix:
-        cuda: [ cuda11.8, cuda12.1 ]
+        cuda: [ cuda11.8, cuda12.2 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -29,7 +29,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
+          cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
         working-directory: ${{ github.workspace }}
 
@@ -54,11 +54,11 @@ jobs:
       - name: PyTests
         run: |
           set -e
-          cd build && make pylib-copy
-          mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
+          mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ./python/test/test_mscclpp.py -x
 
       - name: ReportCoverage
         run: |
+          set -e
           cd build
           lcov --capture --directory . --output-file coverage.info
           lcov --remove coverage.info \
@@ -68,4 +68,4 @@ jobs:
               '*/test/*' \
               '*/tools/*' \
               --output-file coverage.info
-          lcov --list coverage.info
+          lcov --list coverage.info
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,47 @@
+cff-version: 1.2.0
+title: "MSCCL++: A GPU-driven communication stack for scalable AI applications"
+version: 0.4.2
+message: >-
+  If you use this project in your research, please cite it as below.
+authors:
+  - given-names: Peng
+    family-names: Cheng
+    affiliation: Microsoft Research
+  - given-names: Changho
+    family-names: Hwang
+    affiliation: Microsoft Research
+  - given-names: Abhinav
+    family-names: Jangda
+    affiliation: Microsoft Research
+  - given-names: Suriya
+    family-names: Kalivardhan
+    affiliation: Microsoft Azure
+  - given-names: Binyang
+    family-names: Li
+    affiliation: Microsoft Azure
+  - given-names: Shuguang
+    family-names: Liu
+    affiliation: Microsoft Azure
+  - given-names: Saeed
+    family-names: Maleki
+    affiliation: Microsoft Research
+  - given-names: Madan
+    family-names: Musuvathi
+    affiliation: Microsoft Research
+  - given-names: Olli
+    family-names: Saarikivi
+    affiliation: Microsoft Research
+  - given-names: Wei
+    family-names: Tsui
+    affiliation: Microsoft Research
+  - given-names: Ziyue
+    family-names: Yang
+    affiliation: Microsoft Research
+
+repository-code: 'https://github.com/microsoft/mscclpp'
+abstract: >-
+  MSCCL++ redefines the interface for inter-GPU communication, thereby
+  delivering a highly efficient and customizable communication stack
+  tailored for distributed GPU applications.
+license: MIT
+license-url: https://github.com/microsoft/mscclpp/blob/main/LICENSE