From 9d9f0d309edd03fa3e5ddb7f7f6a6dd4ee6b87ef Mon Sep 17 00:00:00 2001 From: Stephen Baione <109226581+stbaione@users.noreply.github.com> Date: Mon, 25 Nov 2024 09:01:55 -0600 Subject: [PATCH 1/3] Move SGLang related tests (#601) Split from this PR: https://github.com/nod-ai/shark-ai/pull/590 We have too many tests running on `mi300x-3` and need to move the SGLang related ones to `mi300x-4`. This PR moves the workflows for `sglang_integration_tests` and `sglang_benchmark_tests` to mi300x-4, along with removing the assumption of static MODEL_PATH and TOKENIZER_PATH, downloading them on demand instead. --- .github/workflows/ci-sglang-benchmark.yml | 4 +- .../workflows/ci-sglang-integration-tests.yml | 2 +- .../llm/sglang_benchmarks/__init__.py | 5 +++ .../llm/{ => sglang_benchmarks}/conftest.py | 16 ++++++-- .../sglang_benchmark_test.py | 41 ++++++++----------- .../llm/{ => sglang_benchmarks}/utils.py | 13 ++++++ app_tests/integration_tests/llm/utils.py | 2 +- 7 files changed, 52 insertions(+), 31 deletions(-) create mode 100644 app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py rename app_tests/benchmark_tests/llm/{ => sglang_benchmarks}/conftest.py (74%) rename app_tests/benchmark_tests/llm/{ => sglang_benchmarks}/sglang_benchmark_test.py (76%) rename app_tests/benchmark_tests/llm/{ => sglang_benchmarks}/utils.py (84%) diff --git a/.github/workflows/ci-sglang-benchmark.yml b/.github/workflows/ci-sglang-benchmark.yml index 504e7e5e3..f44e2772b 100644 --- a/.github/workflows/ci-sglang-benchmark.yml +++ b/.github/workflows/ci-sglang-benchmark.yml @@ -28,7 +28,7 @@ jobs: matrix: version: [3.11] fail-fast: false - runs-on: llama-mi300x-3 + runs-on: mi300x-4 defaults: run: shell: bash @@ -78,7 +78,7 @@ jobs: run: pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python" - name: Launch Shortfin Server - run: pytest -v app_tests/benchmark_tests/llm/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html + run: pytest -v 
app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py --log-cli-level=INFO --html=out/llm/sglang/index.html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0 diff --git a/.github/workflows/ci-sglang-integration-tests.yml b/.github/workflows/ci-sglang-integration-tests.yml index 1c382617d..c61756d78 100644 --- a/.github/workflows/ci-sglang-integration-tests.yml +++ b/.github/workflows/ci-sglang-integration-tests.yml @@ -29,7 +29,7 @@ jobs: matrix: version: [3.11] fail-fast: false - runs-on: llama-mi300x-3 + runs-on: mi300x-4 defaults: run: shell: bash diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py new file mode 100644 index 000000000..a85ba359d --- /dev/null +++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/__init__.py @@ -0,0 +1,5 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception diff --git a/app_tests/benchmark_tests/llm/conftest.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py similarity index 74% rename from app_tests/benchmark_tests/llm/conftest.py rename to app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py index cc354b7eb..1e1c64b24 100644 --- a/app_tests/benchmark_tests/llm/conftest.py +++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py @@ -9,15 +9,22 @@ import pytest import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) -from integration_tests.llm.utils import compile_model, export_paged_llm_v1 +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +) +from integration_tests.llm.utils import ( + compile_model, + export_paged_llm_v1, + download_with_hf_datasets, +) @pytest.fixture(scope="module") def pre_process_model(request, tmp_path_factory): tmp_dir = tmp_path_factory.mktemp("sglang_benchmark_test") - model_path = request.param["model_path"] + model_name = request.param["model_name"] + model_param_file_name = request.param["model_param_file_name"] settings = request.param["settings"] batch_sizes = request.param["batch_sizes"] @@ -25,6 +32,9 @@ def pre_process_model(request, tmp_path_factory): config_path = tmp_dir / "config.json" vmfb_path = tmp_dir / "model.vmfb" + model_path = tmp_dir / model_param_file_name + download_with_hf_datasets(tmp_dir, model_name) + export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes) config = { diff --git a/app_tests/benchmark_tests/llm/sglang_benchmark_test.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py similarity index 76% rename from app_tests/benchmark_tests/llm/sglang_benchmark_test.py rename to app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py index 0de775795..b66904570 100644 --- a/app_tests/benchmark_tests/llm/sglang_benchmark_test.py +++ 
b/app_tests/benchmark_tests/llm/sglang_benchmarks/sglang_benchmark_test.py @@ -4,7 +4,6 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import json import logging import multiprocessing import os @@ -16,14 +15,14 @@ pytest.importorskip("sglang") from sglang import bench_serving -from utils import SGLangBenchmarkArgs +from .utils import SGLangBenchmarkArgs, log_jsonl_result from integration_tests.llm.utils import ( find_available_port, start_llm_server, ) -logger = logging.getLogger("__name__") +logger = logging.getLogger(__name__) device_settings = { "device_flags": [ @@ -33,30 +32,21 @@ "device": "hip", } -# TODO: Download on demand instead of assuming files exist at this path -MODEL_PATH = Path("/data/llama3.1/8b/llama8b_f16.irpa") -TOKENIZER_DIR = Path("/data/llama3.1/8b/") - - -def log_jsonl_result(file_path): - with open(file_path, "r") as file: - json_string = file.readline().strip() - - json_data = json.loads(json_string) - for key, val in json_data.items(): - logger.info(f"{key.upper()}: {val}") - @pytest.mark.parametrize( - "request_rate", - [1, 2, 4, 8, 16, 32], + "request_rate,model_param_file_name", + [ + (req_rate, "meta-llama-3.1-8b-instruct.f16.gguf") + for req_rate in [1, 2, 4, 8, 16, 32] + ], ) @pytest.mark.parametrize( "pre_process_model", [ ( { - "model_path": MODEL_PATH, + "model_name": "llama3_8B_fp16", + "model_param_file_name": "meta-llama-3.1-8b-instruct.f16.gguf", "settings": device_settings, "batch_sizes": [1, 4], } @@ -64,7 +54,9 @@ def log_jsonl_result(file_path): ], indirect=True, ) -def test_sglang_benchmark_server(request_rate, pre_process_model): +def test_sglang_benchmark_server( + request_rate, model_param_file_name, pre_process_model +): # TODO: Remove when multi-device is fixed os.environ["ROCR_VISIBLE_DEVICES"] = "1" @@ -72,7 +64,8 @@ def test_sglang_benchmark_server(request_rate, pre_process_model): config_path = tmp_dir / "config.json" vmfb_path = 
tmp_dir / "model.vmfb" - tokenizer_path = TOKENIZER_DIR / "tokenizer.json" + tokenizer_path = tmp_dir / "tokenizer.json" + model_path = tmp_dir / model_param_file_name # Start shortfin llm server port = find_available_port() @@ -81,7 +74,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model): tokenizer_path, config_path, vmfb_path, - MODEL_PATH, + model_path, device_settings, timeout=30, ) @@ -91,7 +84,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model): backend="shortfin", num_prompt=10, base_url=f"http://localhost:{port}", - tokenizer=TOKENIZER_DIR, + tokenizer=tmp_dir, request_rate=request_rate, ) output_file = ( @@ -116,7 +109,7 @@ def test_sglang_benchmark_server(request_rate, pre_process_model): logger.info("======== RESULTS ========") log_jsonl_result(benchmark_args.output_file) except Exception as e: - logger.info(e) + logger.error(e) server_process.terminate() server_process.wait() diff --git a/app_tests/benchmark_tests/llm/utils.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py similarity index 84% rename from app_tests/benchmark_tests/llm/utils.py rename to app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py index 55b01da04..47cea4d76 100644 --- a/app_tests/benchmark_tests/llm/utils.py +++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/utils.py @@ -6,8 +6,12 @@ from argparse import Namespace from dataclasses import dataclass +import json +import logging from pathlib import Path +logger = logging.getLogger(__name__) + @dataclass class SGLangBenchmarkArgs: @@ -54,3 +58,12 @@ def __repr__(self): f"Tokenizer: {self.tokenizer}\n" f"Request Rate: {self.request_rate}" ) + + +def log_jsonl_result(file_path): + with open(file_path, "r") as file: + json_string = file.readline().strip() + + json_data = json.loads(json_string) + for key, val in json_data.items(): + logger.info(f"{key.upper()}: {val}") diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py index 
05712039e..80b5b3c09 100644 --- a/app_tests/integration_tests/llm/utils.py +++ b/app_tests/integration_tests/llm/utils.py @@ -15,7 +15,7 @@ import requests from transformers import AutoTokenizer -logger = logging.getLogger("__name__") +logger = logging.getLogger(__name__) class AccuracyValidationException(RuntimeError): From bf8540f482f453f4fa04ac750381135e99815bb1 Mon Sep 17 00:00:00 2001 From: Kyle Wang Date: Mon, 25 Nov 2024 23:14:27 +0800 Subject: [PATCH 2/3] Add a DPP intro in amdgpu_kernel_optimization_guide.md (#598) --- docs/amdgpu_kernel_optimization_guide.md | 123 ++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/docs/amdgpu_kernel_optimization_guide.md b/docs/amdgpu_kernel_optimization_guide.md index 09c5b59f9..91b7f1385 100644 --- a/docs/amdgpu_kernel_optimization_guide.md +++ b/docs/amdgpu_kernel_optimization_guide.md @@ -4,7 +4,7 @@ Author: Jakub Kuderski @kuhar Date: 2024-06-24 -Last Update: 2024-08-22 +Last Update: 2024-11-22 ## Introduction @@ -293,3 +293,124 @@ forms a *clause* that translates to a single data fabric transaction. > [!TIP] > For allocations of 4 GB or less, you can implement predicated loads using the > `buffer` instructions. + +## Data-Parallel Primitives and Warp-level Reduction + +For cross-lane data sharing, the most straightforward way is LDS. Some lanes +write data to some locations on LDS and other lanes read data from LDS. Besides, +there are several instructions that can be used to share data across lanes within a +wavefront/warp. + +Here's a brief introduction to these instructions. Please check out [this +blog](https://gpuopen.com/learn/amd-gcn-assembly-cross-lane-operations/) for +details. + +### ds_permute/ds_bpermute + +`ds_permute`/`ds_bpermute` instructions use LDS hardware for data sharing but +don't actually write to an LDS location. But it still needs an `s_waitcnt` +instruction to determine when data is returned to `dest` VGPR. 
+ +Example: +```nasm +ds_bpermute_b32 dest, addr, src [offset:addr_offset] +``` + +### ds_swizzle + +Compared to `ds_bpermute`, the `ds_swizzle` instruction doesn't require an +additional VGPR for offset since it's encoded in the instruction. + +`ds_swizzle` is likely to have fewer address generation instructions required +than `ds_bpermute`. + +The cons are: +1. It only supports limited patterns. +2. Similar to `ds_bpermute`, `s_waitcnt` is required to wait for the `dest` VGPR. + +Example: +```nasm +ds_swizzle_b32 dest, src offset:ds_pattern +``` + +### Data-Parallel Primitives, DPP + +DPP is a 32-bit instruction modifier appended to the normal VALU instructions. +It allows VALU instructions to access data in neighboring lanes directly, which +means it doesn't need LDS hardware anymore, hence `s_waitcnt` instructions are +**not required**. + +Unfortunately, it also supports only limited patterns like `ds_swizzle`. And there +are some instructions that can't be modified by DPP. + +Example: +```nasm +; Normal VALU instruction. +v_add_f32 + +; Instruction modified by DPP. +v_add_f32_dpp +``` + +It's worth mentioning that DPP has different names and syntaxes on different +architectures: +* CDNA: DPP +* RDNA: DPP8/DPP16 + +For details, please check the [MI300 ISA Reference +Guide](https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/amd-instinct-mi300-cdna3-instruction-set-architecture.pdf) +and the [RDNA3 ISA Reference +Guide](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf). 
+ +### How to use them in MLIR + +Each instruction has a corresponding Op in MLIR (except for `ds_permute`, this +one is not implemented at the time of writing): +* `ds_bpermute`: `rocdl.ds_bpermute` +* `ds_swizzle`: `rocdl.ds_swizzle` +* DPP: `rocdl.update.dpp`, `amdgpu.dpp` (a thin wrapper around + `rocdl.update.dpp` with a more comprehensive user interface, e.g., replace magic + numbers with enums) + +The first 2 are straightforward, while DPP follows a different fashion. + +Since DPP is an instruction modifier instead of an instruction itself, there is a +tremendous number of combinations of VALU instructions and DPP. To solve that, +`rocdl.update.dpp` and `amdgpu.dpp` are designed to be a wrapper of the +`v_mov_b32_dpp` instruction. And it depends on the LLVM compiler to fuse it with the +subsequent VALU instruction **with best efforts**. + +For example, `v_mov_b32_dpp` + `v_add_f32_e32` might be fused into `v_add_f32_dpp`. + +There are plenty of constraints stopping an instruction from being merged. For +example, if either the `bank_mask` or the `row_mask` is not `0xf`, it can't be +fused. You can check the +[GCNDPPCombine::combineDPPMov](https://github.com/llvm/llvm-project/blob/ab51eccf88f5321e7c60591c5546b254b6afab99/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp#L522) +function to see how it works. + +### Comparison + +To summarize, there's no free lunch: an instruction's expressivity comes at the +expense of performance. 
+ +The relative performance of cross-lane instructions is as follows: + +DPP > `ds_swizzle` >= `ds_permute` > `ds_bpermute` + +while the generality ranking is the reverse: + +DPP < `ds_swizzle` < `ds_permute` < `ds_bpermute` + +This table presents the approximate instruction latency, collected +experimentally on Fused Softmax kernel with +[rocprofv2](https://github.com/ROCm/rocprofiler?tab=readme-ov-file#plugin-support) +on the MI300 GPU: + +| Instructions | MLIR Op | Hardware | latency/#cycles | +| ---------------------- | ---------------------------- | ------------ | --------------- | +| ds_permute/ds_bpermute | rocdl.ds_bpermute | LDS hardware | ~50* | +| ds_swizzle | rocdl.ds_swizzle | LDS hardware | ~50* | +| DPP | rocdl.update.dpp, amdgpu.dpp | VALU | 4~12 | + +*: For `ds_permute`/`ds_bpermute` and `ds_swizzle`, the latency includes the +instruction itself and its corresponding `s_waitcnt` instruction. From e906b669b071d1804704b619910b6774f0604070 Mon Sep 17 00:00:00 2001 From: Stephen Baione <109226581+stbaione@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:53:50 -0600 Subject: [PATCH 3/3] See if stale dependencies are causing shortfin server to fail to start (#604) # Description We started seeing a failure in `Shortfin CPU LLM Integration Test` after merging #601. However, the only aspect of the integration test that that PR touches is a fix in the logger: Old ```python logger = logging.getLogger("__name__") ``` New ```python logger = logging.getLogger(__name__) ``` That shouldn't have an impact on the test, and while reading the output of the workflow, it didn't seem to be the line that caused the server to not start. When testing locally in a fresh environment, the test ran fine, which made me think that it may be related to stale dependencies. I updated the hash of cached pip to take into account requirement changes in `sharktank` and `shortfin`, which appears to fix the test. 
--- .github/workflows/ci-shark-ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml index bf8007e65..fc85a76a7 100644 --- a/.github/workflows/ci-shark-ai.yml +++ b/.github/workflows/ci-shark-ai.yml @@ -49,7 +49,7 @@ jobs: id: cache-pip with: path: ${{ env.PIP_CACHE_DIR }} - key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }} + key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements*.txt','shortfin/requirements*.txt','sharktank/requirements*.txt') }} - name: Install pip deps run: |