[tuner]: Add a utility function to query supported MMA intrinsics (#1…

…9124) This PR aims to address the task listed in nod-ai/shark-ai#453: add a utility function (`QueryMMAIntrinsics`) to query supported MMA intrinsics. A new test pass `TestLLVMGPUQueryMMAPass` has been added to validate the correctness of this utility function, along with a corresponding test to ensure reliable functionality. TODO: The function will be exposed to both the C API and Python in a follow-up PR. --------- Signed-off-by: Bangtian Liu <[email protected]>
iree-org · Nov 14, 2024 · 9eaa4ef · 9eaa4ef
1 parent 4c0fd90
commit 9eaa4ef
Show file tree

Hide file tree

Showing 9 changed files with 174 additions and 0 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -112,6 +112,7 @@ iree_compiler_cc_library(
         "ROCDLKernelConfig.cpp",
         "ROCDLLowerExecutableTarget.cpp",
         "ROCDLSelectLoweringStrategy.cpp",
+        "TestLLVMGPUQueryMMAPass.cpp",
         "Verifiers.cpp",
     ],
     hdrs = [

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -97,6 +97,7 @@ iree_cc_library(
     "ROCDLKernelConfig.cpp"
     "ROCDLLowerExecutableTarget.cpp"
     "ROCDLSelectLoweringStrategy.cpp"
+    "TestLLVMGPUQueryMMAPass.cpp"
     "Verifiers.cpp"
   DEPS
     ::PassHeaders

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -163,4 +163,9 @@ def TestLLVMGPUScalarizeMathOpPass :
   let summary = "Test pass for several legalization patterns.";
 }
 
+def TestLLVMGPUQueryMMAPass :
+    Pass<"iree-test-llvmgpu-query-mma", "ModuleOp"> {
+  let summary = "Test pass for querying the supported mma intrinsic instructions.";
+}
+
 #endif // IREE_CODEGEN_LLVMGPU_PASSES
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TestLLVMGPUQueryMMAPass.cpp
@@ -0,0 +1,40 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "iree-test-llvmgpu-query-mma"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_TESTLLVMGPUQUERYMMAPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
+namespace {
+
+struct TestLLVMGPUQueryMMAPass final
+    : impl::TestLLVMGPUQueryMMAPassBase<TestLLVMGPUQueryMMAPass> {
+  void runOnOperation() override {
+    ModuleOp moduleOp = getOperation();
+    llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
+                        SmallVector<IREE::GPU::MMAIntrinsic>>
+        mmaMap = queryMMAIntrinsics(moduleOp);
+    for (const auto &[op, mmaAttrs] : mmaMap) {
+      llvm::outs() << "Executable Variant Name: "
+                   << cast<IREE::HAL::ExecutableVariantOp>(*op).getName()
+                   << "\n";
+      llvm::outs() << "MMA Intrinsics: ";
+      llvm::interleave(mmaAttrs, llvm::outs(), " ");
+      llvm::outs() << "\n";
+    }
+  }
+};
+} // namespace
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -55,6 +55,7 @@ iree_lit_test_suite(
             "promote_matmul_to_fit_mma.mlir",
             "tensor_pad.mlir",
             "tensorcore_vectorization.mlir",
+            "test_query_mma.mlir",
             "transform_dialect_bufferize.mlir",
             "transform_dialect_eliminate_gpu_barriers.mlir",
             "transform_dialect_hoist_allocs.mlir",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -52,6 +52,7 @@ iree_lit_test_suite(
     "rocdl_pipeline_test.mlir"
     "tensor_pad.mlir"
     "tensorcore_vectorization.mlir"
+    "test_query_mma.mlir"
     "transform_dialect_bufferize.mlir"
     "transform_dialect_eliminate_gpu_barriers.mlir"
     "transform_dialect_hoist_allocs.mlir"

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/test_query_mma.mlir
@@ -0,0 +1,98 @@
+// RUN: iree-opt --split-input-file --iree-test-llvmgpu-query-mma %s | FileCheck %s
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
+wgp = <compute = int32, storage =  b32,
+subgroup = arithmetic, dot = dp4xi8toi32,
+mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024],
+max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+max_workgroup_counts = [2147483647]>>}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+module {
+  hal.executable private @main {
+    hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
+      hal.executable.export public @entry_point layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn() {
+          return
+        }
+      }
+    }
+  }
+}
+
+// CHECK:       Executable Variant Name
+// CHECK-SAME:  main
+// CHECK: MMA   Intrinsics
+// CHECK-SAME:  MFMA_F32_16x16x4_F32
+// CHECK-SAME:  MFMA_F32_16x16x16_F16
+// CHECK-LABEL: func.func @fn
+
+// -----
+
+#executable_target_rocm_hsaco_fb0 = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
+wgp = <compute = int32, storage =  b32,
+subgroup = arithmetic, dot = dp4xi8toi32,
+mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024],
+max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+max_workgroup_counts = [2147483647]>>}>
+#executable_target_rocm_hsaco_fb1 = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
+wgp = <compute = int32, storage =  b32,
+subgroup = arithmetic, dot = dp4xi8toi32,
+mma = [<MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x16_BF16>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024],
+max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+max_workgroup_counts = [2147483647]>>}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+module {
+  hal.executable private @main_0 {
+    hal.executable.variant public @main_0 target(#executable_target_rocm_hsaco_fb0) {
+      hal.executable.export public @entry_point_0 layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn_0() {
+          return
+        }
+      }
+    }
+  }
+  hal.executable private @main_1 {
+    hal.executable.variant public @main_1 target(#executable_target_rocm_hsaco_fb1) {
+      hal.executable.export public @entry_point layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn_1() {
+          return
+        }
+      }
+    }
+  }
+}
+
+// CHECK-DAG: main_0
+// CHECK-DAG: MMA Intrinsics: MFMA_F32_16x16x4_F32 MFMA_F32_16x16x16_F16
+// CHECK-DAG: main_1
+// CHECK-DAG: MMA Intrinsics: MFMA_F32_32x32x8_F16 MFMA_F32_16x16x16_BF16
+
+// -----
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+module {
+  hal.executable private @main {
+    hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
+      hal.executable.export public @entry_point layout(#pipeline_layout)
+      builtin.module {
+        func.func @fn_empty() {
+          return
+        }
+      }
+    }
+  }
+}
+
+// CHECK-NOT:   Executable Variant Name
+// CHECK-NOT:   MMA Intrinsics
+// CHECK-LABEL: func.func @fn
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -1028,4 +1028,22 @@ std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func) {
   return std::nullopt;
 }
 
+llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
+                    SmallVector<IREE::GPU::MMAIntrinsic>>
+queryMMAIntrinsics(mlir::ModuleOp moduleOp) {
+  llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
+                      SmallVector<IREE::GPU::MMAIntrinsic>>
+      mmaAttributesMap;
+  moduleOp.walk([&](IREE::HAL::ExecutableVariantOp executableOp) {
+    if (IREE::GPU::TargetAttr target = getGPUTargetAttr(executableOp)) {
+      auto mmaIntrinsics = llvm::map_to_vector(
+          target.getWgp().getMma(), [](IREE::GPU::MMAAttr attr) {
+            return attr.getIntrinsic().getValue();
+          });
+      mmaAttributesMap[executableOp] = std::move(mmaIntrinsics);
+    }
+  });
+  return mmaAttributesMap;
+}
+
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
@@ -8,6 +8,7 @@
 #define IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_
 
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
@@ -206,6 +207,14 @@ IREE::GPU::TargetAttr getGPUTargetAttr(Operation *op);
 /// Returns std::nullopt if none found.
 std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func);
 
+/// Returns a map of supported MMA intrinsic instructions based on the
+/// GPU target descriptions in `moduleOp`. Each entry in the map associates
+/// an `IREE::HAL::ExecutableVariantOp` with a vector of
+/// `IREE::GPU::MMAIntrinsic` attributes.
+llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
+                    SmallVector<IREE::GPU::MMAIntrinsic>>
+queryMMAIntrinsics(mlir::ModuleOp moduleOp);
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_