Skip to content

Commit

Permalink
[tuner]: Add a utility function to query supported MMA intrinsics (ir…
Browse files Browse the repository at this point in the history
…ee-org#19124)

This PR aims to address the task listed in
nod-ai/shark-ai#453: add a utility
function (`QueryMMAIntrinsics`) to query supported MMA intrinsics.

A new test pass `TestLLVMGPUQueryMMAPass` has been added to validate the
correctness of this utility function, along with a corresponding test to
ensure reliable functionality.

TODO: The function will be exposed to both the C API and Python in a
follow-up PR.

---------

Signed-off-by: Bangtian Liu <[email protected]>
Signed-off-by: Giacomo Serafini <[email protected]>
  • Loading branch information
bangtianliu authored and giacs-epic committed Dec 4, 2024
1 parent 4a37916 commit 3757aa2
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 0 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ iree_compiler_cc_library(
"ROCDLKernelConfig.cpp",
"ROCDLLowerExecutableTarget.cpp",
"ROCDLSelectLoweringStrategy.cpp",
"TestLLVMGPUQueryMMAPass.cpp",
"Verifiers.cpp",
],
hdrs = [
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ iree_cc_library(
"ROCDLKernelConfig.cpp"
"ROCDLLowerExecutableTarget.cpp"
"ROCDLSelectLoweringStrategy.cpp"
"TestLLVMGPUQueryMMAPass.cpp"
"Verifiers.cpp"
DEPS
::PassHeaders
Expand Down
5 changes: 5 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,9 @@ def TestLLVMGPUScalarizeMathOpPass :
let summary = "Test pass for several legalization patterns.";
}

def TestLLVMGPUQueryMMAPass :
Pass<"iree-test-llvmgpu-query-mma", "ModuleOp"> {
let summary = "Test pass for querying the supported mma intrinsic instructions.";
}

#endif // IREE_CODEGEN_LLVMGPU_PASSES
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"

#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "iree-test-llvmgpu-query-mma"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_TESTLLVMGPUQUERYMMAPASS
#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"

namespace {

struct TestLLVMGPUQueryMMAPass final
: impl::TestLLVMGPUQueryMMAPassBase<TestLLVMGPUQueryMMAPass> {
void runOnOperation() override {
ModuleOp moduleOp = getOperation();
llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
SmallVector<IREE::GPU::MMAIntrinsic>>
mmaMap = queryMMAIntrinsics(moduleOp);
for (const auto &[op, mmaAttrs] : mmaMap) {
llvm::outs() << "Executable Variant Name: "
<< cast<IREE::HAL::ExecutableVariantOp>(*op).getName()
<< "\n";
llvm::outs() << "MMA Intrinsics: ";
llvm::interleave(mmaAttrs, llvm::outs(), " ");
llvm::outs() << "\n";
}
}
};
} // namespace
} // namespace mlir::iree_compiler
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ iree_lit_test_suite(
"promote_matmul_to_fit_mma.mlir",
"tensor_pad.mlir",
"tensorcore_vectorization.mlir",
"test_query_mma.mlir",
"transform_dialect_bufferize.mlir",
"transform_dialect_eliminate_gpu_barriers.mlir",
"transform_dialect_hoist_allocs.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ iree_lit_test_suite(
"rocdl_pipeline_test.mlir"
"tensor_pad.mlir"
"tensorcore_vectorization.mlir"
"test_query_mma.mlir"
"transform_dialect_bufferize.mlir"
"transform_dialect_eliminate_gpu_barriers.mlir"
"transform_dialect_hoist_allocs.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// RUN: iree-opt --split-input-file --iree-test-llvmgpu-query-mma %s | FileCheck %s

#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
wgp = <compute = int32, storage = b32,
subgroup = arithmetic, dot = dp4xi8toi32,
mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>],
subgroup_size_choices = [64], max_workgroup_sizes = [1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
max_workgroup_counts = [2147483647]>>}>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
module {
hal.executable private @main {
hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @entry_point layout(#pipeline_layout)
builtin.module {
func.func @fn() {
return
}
}
}
}
}

// CHECK: Executable Variant Name
// CHECK-SAME: main
// CHECK: MMA Intrinsics
// CHECK-SAME: MFMA_F32_16x16x4_F32
// CHECK-SAME: MFMA_F32_16x16x16_F16
// CHECK-LABEL: func.func @fn

// -----

#executable_target_rocm_hsaco_fb0 = #hal.executable.target<"rocm", "rocm-hsaco-fb",
{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
wgp = <compute = int32, storage = b32,
subgroup = arithmetic, dot = dp4xi8toi32,
mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>],
subgroup_size_choices = [64], max_workgroup_sizes = [1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
max_workgroup_counts = [2147483647]>>}>
#executable_target_rocm_hsaco_fb1 = #hal.executable.target<"rocm", "rocm-hsaco-fb",
{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
wgp = <compute = int32, storage = b32,
subgroup = arithmetic, dot = dp4xi8toi32,
mma = [<MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x16_BF16>],
subgroup_size_choices = [64], max_workgroup_sizes = [1024],
max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
max_workgroup_counts = [2147483647]>>}>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
module {
hal.executable private @main_0 {
hal.executable.variant public @main_0 target(#executable_target_rocm_hsaco_fb0) {
hal.executable.export public @entry_point_0 layout(#pipeline_layout)
builtin.module {
func.func @fn_0() {
return
}
}
}
}
hal.executable private @main_1 {
hal.executable.variant public @main_1 target(#executable_target_rocm_hsaco_fb1) {
hal.executable.export public @entry_point layout(#pipeline_layout)
builtin.module {
func.func @fn_1() {
return
}
}
}
}
}

// CHECK-DAG: main_0
// CHECK-DAG: MMA Intrinsics: MFMA_F32_16x16x4_F32 MFMA_F32_16x16x16_F16
// CHECK-DAG: main_1
// CHECK-DAG: MMA Intrinsics: MFMA_F32_32x32x8_F16 MFMA_F32_16x16x16_BF16

// -----

#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
module {
hal.executable private @main {
hal.executable.variant public @main target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @entry_point layout(#pipeline_layout)
builtin.module {
func.func @fn_empty() {
return
}
}
}
}
}

// CHECK-NOT: Executable Variant Name
// CHECK-NOT: MMA Intrinsics
// CHECK-LABEL: func.func @fn
18 changes: 18 additions & 0 deletions compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1028,4 +1028,22 @@ std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func) {
return std::nullopt;
}

llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
SmallVector<IREE::GPU::MMAIntrinsic>>
queryMMAIntrinsics(mlir::ModuleOp moduleOp) {
llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
SmallVector<IREE::GPU::MMAIntrinsic>>
mmaAttributesMap;
moduleOp.walk([&](IREE::HAL::ExecutableVariantOp executableOp) {
if (IREE::GPU::TargetAttr target = getGPUTargetAttr(executableOp)) {
auto mmaIntrinsics = llvm::map_to_vector(
target.getWgp().getMma(), [](IREE::GPU::MMAAttr attr) {
return attr.getIntrinsic().getValue();
});
mmaAttributesMap[executableOp] = std::move(mmaIntrinsics);
}
});
return mmaAttributesMap;
}

} // namespace mlir::iree_compiler
9 changes: 9 additions & 0 deletions compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_

#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
Expand Down Expand Up @@ -206,6 +207,14 @@ IREE::GPU::TargetAttr getGPUTargetAttr(Operation *op);
/// Returns std::nullopt if none found.
std::optional<int> getGPUSubgroupSize(mlir::FunctionOpInterface func);

/// Returns a map of supported MMA intrinsic instructions based on the
/// GPU target descriptions in `moduleOp`. Each entry in the map associates
/// an `IREE::HAL::ExecutableVariantOp` with a vector of
/// `IREE::GPU::MMAIntrinsic` attributes.
llvm::SmallDenseMap<IREE::HAL::ExecutableVariantOp,
SmallVector<IREE::GPU::MMAIntrinsic>>
queryMMAIntrinsics(mlir::ModuleOp moduleOp);

} // namespace mlir::iree_compiler

#endif // IREE_COMPILER_CODEGEN_UTILS_GPUUTILS_H_

0 comments on commit 3757aa2

Please sign in to comment.