From a35890fc989cbb886d7523a355884d95559a25a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20B=C3=B6ck?= Date: Sun, 1 Sep 2024 14:45:56 +0100 Subject: [PATCH] [DMA] Clarify concurrent behaviour of transfers and fix `start_tensor_copy` (#129) The described semantics give more flexibility to the backend and additionally fixes issues we've seen with Snitch's DMA. --- .../src/Quidditch/Dialect/DMA/IR/DMAOps.cpp | 10 ++++++++-- .../compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td | 13 ++++++++++++- codegen/tests/Dialect/DMA/IR/bufferization.mlir | 3 ++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp index 1208ad0..913bfab 100644 --- a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.cpp @@ -290,8 +290,14 @@ StartTensorCopyOp::bufferize(RewriterBase &rewriter, // Zero out the entire buffer prior to overwriting it with the copied values. // TODO: This could be optimized to only zero regions that won't be filled // with the copied values at the cost of 2^rank transfers instead of two. - if (hasPadding() && !getUndefPadding()) - rewriter.create(getLoc(), *alloc); + if (hasPadding() && !getUndefPadding()) { + Value token = rewriter.create(getLoc(), *alloc); + // TODO: This wait is required as the zero mem transfer writes to the same + // memory as the transfer below. This is currently unspecified + // behaviour (both in the DMA dialect and in Snitch as far as we + // know). + rewriter.create(getLoc(), token); + } // Subview into the original memory without any padding. 
// As we only add padding at the end of the dimensions, the offsets are always diff --git a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td index b5f23fd..7307326 100644 --- a/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td +++ b/codegen/compiler/src/Quidditch/Dialect/DMA/IR/DMAOps.td @@ -133,7 +133,12 @@ def DMA_StartTransferOp : DMA_Op<"start_transfer", The DMA operation is likely (but not guaranteed) to run asynchronous and its completion only guaranteed by executing the `wait_for_transfers` - operation with the token returned by this operation or a later one. + operation with the token returned by this operation. + + Due to the unspecified order and concurrency of transfers, the resulting + state of a MemRef is unspecified if at any point two not-yet + completed transfers exist that either write to the same memory location + or write to a memory location read by another transfer. }]; let arguments = (ins @@ -154,7 +159,13 @@ def DMA_StartZeroMemTransferOp : DMA_Op<"start_zero_mem_transfer", [MemoryEffects<[MemWrite]>]> { let description = [{ + Starts a DMA transfer which when completed has filled the given MemRef + entirely with bits of 0. + I.e. this is equal to C's `memset` with zero, but asynchronous. + The semantics are identical to a `start_transfer` operation where the + source is a MemRef identical in shape to `filled` consisting of just + 0 bits. 
}]; let arguments = (ins diff --git a/codegen/tests/Dialect/DMA/IR/bufferization.mlir b/codegen/tests/Dialect/DMA/IR/bufferization.mlir index c6843fc..c3a1802 100644 --- a/codegen/tests/Dialect/DMA/IR/bufferization.mlir +++ b/codegen/tests/Dialect/DMA/IR/bufferization.mlir @@ -91,7 +91,8 @@ func.func @tensor_copy_pad(%arg0 : tensor, %pad0 : index, %pad1 : index // CHECK: %[[NEW_DIM0:.*]] = affine.apply #[[$MAP2]]()[%[[DIM0]], %[[PAD0]]] // CHECK: %[[NEW_DIM1:.*]] = affine.apply #[[$MAP2]]()[%[[DIM1]], %[[PAD1]]] // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[NEW_DIM0]], %[[NEW_DIM1]]) - // CHECK: start_zero_mem_transfer %[[ALLOC]] + // CHECK: %[[ZERO_TOKEN:.*]] = dma.start_zero_mem_transfer %[[ALLOC]] + // CHECK: dma.wait_for_transfers %[[ZERO_TOKEN]] // CHECK: %[[UNPADDED:.*]] = memref.subview %[[ALLOC]][0, 0] [%[[DIM0]], %[[DIM1]]] [1, 1] // CHECK: %[[TOKEN:.*]] = dma.start_transfer from %[[COPY]] // CHECK-SAME: to %[[UNPADDED]]