Merge branch 'sgl-project:main' into main

stbaione · Nov 6, 2024 · 214a60b · 214a60b
2 parents e144533 + a5e0def
commit 214a60b
Show file tree

Hide file tree

Showing 170 changed files with 5,657 additions and 3,356 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -0,0 +1,25 @@
+# https://editorconfig.org/
+
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.{json,yaml,yml}]
+indent_size = 2
+
+[*.md]
+indent_size = 2
+x-soft-wrap-text = true
+
+[*.rst]
+indent_size = 4
+x-soft-wrap-text = true
+
+[Makefile]
+indent_style = tab
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -11,3 +11,4 @@
 /python/sglang/srt/sampling @merrymercy @hnyls2002
 /test/lang @merrymercy @Ying1123 @ByronHsu
 /test/srt @merrymercy @Ying1123 @zhyncs
+/rust @ByronHsu @Ying1123
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -12,4 +12,4 @@
 
 - [ ] Format your code according to the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
 - [ ] Add unit tests as outlined in the [Contributor Guide](https://github.com/sgl-project/sglang/blob/main/docs/contributor_guide.md).
-- [ ] Update documentation as needed, including docstrings or example tutorials.
+- [ ] Update documentation as needed, including docstrings or example tutorials.
diff --git a/.github/workflows/close-inactive-issues.yml b/.github/workflows/close-inactive-issues.yml
@@ -20,10 +20,10 @@ jobs:
           github-token: ${{secrets.GITHUB_TOKEN}}
           script: |
             const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
-            
+
             const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
             console.log(`Owner: ${owner}, Repo: ${repo}`);
-            
+
             async function fetchIssues(page = 1) {
               console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
               return await github.rest.issues.listForRepo({
@@ -36,23 +36,23 @@ jobs:
                 page: page
               });
             }
-            
+
             async function processIssues() {
               console.log('Starting to process issues');
               console.log(`Repository: ${owner}/${repo}`);
-              
+
               let page = 1;
               let hasMoreIssues = true;
               while (hasMoreIssues) {
                 try {
                   const issues = await fetchIssues(page);
                   console.log(`Fetched ${issues.data.length} issues on page ${page}`);
-                  
+
                   if (issues.data.length === 0) {
                     hasMoreIssues = false;
                     break;
                   }
-                  
+
                   for (const issue of issues.data) {
                     if (new Date(issue.updated_at) < sixtyDaysAgo) {
                       try {
@@ -87,5 +87,5 @@ jobs:
               }
               console.log('Finished processing issues');
             }
-            
+
             await processIssues();
diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml
@@ -18,7 +18,7 @@ concurrency:
   group: execute-notebook-${{ github.ref }}
   cancel-in-progress: true
 
-  
+
 jobs:
   run-all-notebooks:
     runs-on: 1-gpu-runner
@@ -44,11 +44,5 @@ jobs:
       - name: Execute notebooks
         run: |
           cd docs
-          for nb in *.ipynb; do
-            if [ -f "$nb" ]; then
-              echo "Executing $nb"
-              jupyter nbconvert --to notebook --execute --inplace "$nb" \
-                --ExecutePreprocessor.timeout=600 \
-                --ExecutePreprocessor.kernel_name=python3
-            fi
-          done
+          make clean
+          make compile
diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml
@@ -25,9 +25,11 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
+          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
 
-      - name: Nightly gsm8k Accuracy
-        timeout-minutes: 60
+      - name: Nightly gsm8k and human eval Accuracy
+        timeout-minutes: 120
         run: |
           cd test/srt
+          python3 test_nightly_human_eval.py
           python3 test_nightly_gsm8k_eval.py
diff --git a/.github/workflows/pr-test-rust.yml b/.github/workflows/pr-test-rust.yml
@@ -0,0 +1,39 @@
+name: PR Test (Rust)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "rust/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "rust/**"
+  workflow_dispatch:
+
+concurrency:
+  group: pr-test-rust-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-test-rust:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci_install_rust.sh
+      - name: Run fmt
+        run: |
+          source "$HOME/.cargo/env"
+          cd rust/
+          cargo fmt -- --check
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          source "$HOME/.cargo/env"
+          cd rust/
+          cargo test
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -50,7 +50,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 0 --range-end 5
+          python3 run_suite.py --suite minimal --range-begin 0 --range-end 6
 
   unit-test-backend-part-2:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -67,7 +67,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 5 --range-end 17
+          python3 run_suite.py --suite minimal --range-begin 6 --range-end 14
 
   unit-test-backend-part-3:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -84,7 +84,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 17 --range-end 20
+          python3 run_suite.py --suite minimal --range-begin 14 --range-end 20
 
   unit-test-backend-part-4:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -237,7 +237,7 @@ jobs:
         run: |
           cd test/srt
           python3 test_moe_eval_accuracy_large.py
-      
+
       - name: Evaluate MLA Accuracy (TP=2)
         timeout-minutes: 10
         run: |

diff --git a/.github/workflows/deploy-docs.yml → .github/workflows/release-docs.yml b/.github/workflows/deploy-docs.yml → .github/workflows/release-docs.yml
@@ -9,6 +9,10 @@ on:
       - 'python/sglang/version.py'
   workflow_dispatch:
 
+concurrency:
+  group: release-docs-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   execute-and-deploy:
     runs-on: 1-gpu-runner
@@ -38,20 +42,16 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
         run: |
           cd docs
-          for nb in *.ipynb; do
-            if [ -f "$nb" ]; then
-              echo "Executing $nb"
-              jupyter nbconvert --to notebook --execute --inplace "$nb" \
-                --ExecutePreprocessor.timeout=600 \
-                --ExecutePreprocessor.kernel_name=python3
-            fi
-          done
+          make clean
+          make compile
 
           make html
           cd _build/html
-          
-          git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io
+
+          git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
+          rm -rf  ../sgl-project.github.io/*
           cp -r * ../sgl-project.github.io
+          cp ../../README.md ../sgl-project.github.io/README.md
           cd ../sgl-project.github.io
           git config user.name "zhaochenyang20"
           git config user.email "[email protected]"

diff --git a/.gitignore b/.gitignore
@@ -185,4 +185,4 @@ tmp*.txt
 work_dirs/
 *.csv
 
-!logo.png
+!logo.png
diff --git a/.isort.cfg b/.isort.cfg
@@ -1,3 +1,3 @@
 [settings]
 profile=black
-known_first_party=sglang
+known_first_party=sglang
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,27 @@
 default_language_version:
     python: python3.9
 
+default_stages: [pre-commit, pre-push, manual]
+
 repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-symlinks
+      - id: destroyed-symlinks
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      - id: check-toml
+      - id: check-ast
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: check-executables-have-shebangs
+      - id: check-shebang-scripts-are-executable
+      - id: detect-private-key
+      - id: debug-statements
+      - id: no-commit-to-branch
   - repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:
@@ -13,8 +33,3 @@ repos:
         additional_dependencies: ['.[jupyter]']
         types: [python, jupyter]
         types_or: [python, jupyter]
-
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
-    hooks:
-      - id: no-commit-to-branch
diff --git a/3rdparty/amd/profiling/PROFILING.md b/3rdparty/amd/profiling/PROFILING.md
@@ -6,5 +6,3 @@ Two primary methods are covered:
 
 
 - [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
-
-
diff --git a/3rdparty/amd/tuning/TUNING.md b/3rdparty/amd/tuning/TUNING.md
@@ -2,12 +2,100 @@
 This AppNote describes the SGLang performance tuning technical, code harness and running steps for systems with AMD Instinct GPUs.
 Harness code, examples and steps are provided in detail, to facilitate easy reproduce & use to tune performance towards workloads.
 Three primary runtime areas are covered:
-- Triton Kernels
 
+## 1. Triton Kernels
+To maximize Triton kernel efficiency, several strategies can be employed:
 
-- Torch Tunable Ops 
+### Key Tuning Parameters and Environment Variables:
+- **num_stages**: Adjusts the number of pipeline stages to optimize kernel efficiency based on the specific type of operations (e.g., General Matrix Multiplication - GEMM).
+- **waves_per_eu**: Controls the usage of Vector General Purpose Registers (VGPR) to enhance occupancy, thereby improving latency or throughput.
+- **BLOCK_M, BLOCK_N, BLOCK_K**: Tunable tile sizes that assist in balancing memory transfer and computational efficiency.
+- **matrix_instr_nonkdim**: Optimizes the usage of Matrix-Fused Multiply-Add (MFMA) instructions for specific kernel types, such as Flash Attention.
+- **OPTIMIZE_EPILOGUE**: An environment variable that can be set to `1` to enhance performance by eliminating the `convert_layout` operation in the kernel's epilogue.
+```python
+@triton.autotune(configs=[
+        triton.Config({'waves_per_eu': 1}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1),
+    ], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True)
+@triton.jit
+def _triton_kernel_function():
+    ...
+```
+## 2. Torch Tunable Operations
+**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
 
+### Key Environment Variables:
+1. **PYTORCH_TUNABLEOP_ENABLED**:
+   - Default: `0`
+   - Set to `1` to enable TunableOp.
 
-- Torch Compile
+2. **PYTORCH_TUNABLEOP_TUNING**:
+   - Default: `1`
+   - Set to `0` to disable tuning. When left at `1` with `PYTORCH_TUNABLEOP_ENABLED=1`, any operation without a tuned entry is tuned on first use and the resulting entry is recorded.
 
+3. **PYTORCH_TUNABLEOP_VERBOSE**:
+   - Default: `0`
+   - Set to `1` to enable verbose output for TunableOp.
 
+### Usage Example:
+To enable TunableOp and tuning, and optionally enable verbose mode, you can run the following command in your terminal:
+
+```bash
+#Tuning
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=1 your_script.sh
+
+#Inference with tuning op
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_script.sh
+
+#Print out the log
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 PYTORCH_TUNABLEOP_VERBOSE=1 your_script.sh
+
+```
+## 3. Torch Compilation
+
+
+The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (conv) operations in PyTorch using Inductor, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better performance.
+
+To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
+
+### Key Configurations:
+1. **Max Autotune**:
+   - Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
+
+2. **Fine-Grained Control**:
+   - Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
+   - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune_pointwise = True`.
+
+3. **Backend Selection**:
+   - Use `torch._inductor.config.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
+
+4. **Freezing for Inference**:
+   - Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
+
+5. **Debugging**:
+   - Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
+
+### Example Code Block:
+```bash
+#Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 your_script.sh
+
+#Specify your backend to TRITON for Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON your_script.sh
+
+#Inference with large improvement on AMD GPU
+TORCHINDUCTOR_FREEZING=1 your_script.sh
+```
+
+## Reference
+
+For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
+
+[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,5 +6,3 @@ Two primary methods are covered:


		- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)