Skip to content

Commit 9978725

Browse files
authored
Support any-dimensional memrefs in AllocsToSLM pass (#411)
* nD SLM Signed-off-by: dchigarev <[email protected]> * Polish & tests Signed-off-by: dchigarev <[email protected]> * fix compile warnings Signed-off-by: dchigarev <[email protected]> * add 1d test Signed-off-by: dchigarev <[email protected]> * use static array instead of a vector Signed-off-by: dchigarev <[email protected]> --------- Signed-off-by: dchigarev <[email protected]>
1 parent fdfbd1e commit 9978725

File tree

5 files changed

+152
-41
lines changed

5 files changed

+152
-41
lines changed

lib/gc/Transforms/GPU/AllocsToSLM.cpp

Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "gc/Transforms/Passes.h"
1010

11+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
1112
#include "mlir/Dialect/Func/IR/FuncOps.h"
1213
#include "mlir/Dialect/GPU/TransformOps/Utils.h"
1314
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -80,22 +81,16 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
8081
return rewriter.notifyMatchFailure(
8182
allocOp, "Only support constant block sizes for now");
8283

83-
int64_t xI = xSz.value();
84-
int64_t yI = ySz.value();
85-
int64_t zI = zSz.value();
86-
87-
if (zI != 1)
88-
return rewriter.notifyMatchFailure(
89-
allocOp, "Only support 2D shared memory for now");
90-
84+
int64_t blockSizes[3] = {xSz.value(), ySz.value(), zSz.value()};
9185
MemRefType originalMemRefType = cast<MemRefType>(memref.getType());
9286
auto originalShape = originalMemRefType.getShape();
9387

94-
// Scale the allocation size by the number of threads in the work-group
95-
int64_t newX = originalShape[0] * xI;
96-
int64_t newY = originalShape[1] * yI;
97-
98-
SmallVector<int64_t> newShape = {newX, newY};
88+
// Scale the allocation size (X dimension) by the number of threads in the
89+
// work-group
90+
int64_t newX =
91+
originalShape[0] * blockSizes[0] * blockSizes[1] * blockSizes[2];
92+
SmallVector<int64_t> newShape({newX});
93+
newShape.append(originalShape.begin() + 1, originalShape.end());
9994

10095
IntegerAttr sharedAddressSpace =
10196
IntegerAttr::get(rewriter.getIntegerType(64),
@@ -111,27 +106,29 @@ struct ConvertAlloc : public OpRewritePattern<memref::AllocOp> {
111106
allocOp.getOperands())
112107
.getResult();
113108

114-
// Compute the offsets in SLM chunk for the current thread
115-
auto origXConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
116-
originalShape[0]);
117-
auto origYConst = rewriter.create<arith::ConstantIndexOp>(allocOp.getLoc(),
118-
originalShape[1]);
109+
// Compute the offsets in SLM chunk for the current thread:
110+
// X_off = (Xthr_i * Ybl_sz * Zbl_sz + Ythr_i * Zbl_sz + Zthr_i) * Xchunk_sz
111+
// Offsets for other dimensions = 0
112+
auto xI = getAffineDimExpr(0, rewriter.getContext());
113+
auto yI = getAffineDimExpr(1, rewriter.getContext());
114+
auto zI = getAffineDimExpr(2, rewriter.getContext());
115+
auto idxExpr =
116+
(xI * blockSizes[1] * blockSizes[2] + yI * blockSizes[2] + zI) *
117+
originalShape[0];
118+
auto idxMap = AffineMap::get(/*dimCount=*/3, /*symbolCount=*/0, idxExpr);
119119

120120
auto threadIds = launchOp.getThreadIds();
121+
auto offX = rewriter.create<affine::AffineApplyOp>(
122+
allocOp.getLoc(), idxMap,
123+
/*exprOperands=*/ValueRange({threadIds.x, threadIds.y, threadIds.z}));
121124

122-
auto offX =
123-
rewriter
124-
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.x, origXConst)
125-
.getResult();
126-
auto offY =
127-
rewriter
128-
.create<arith::MulIOp>(allocOp.getLoc(), threadIds.y, origYConst)
129-
.getResult();
125+
SmallVector<int64_t> staticOffsets({ShapedType::kDynamic});
126+
staticOffsets.insert(staticOffsets.end(), originalShape.size() - 1, 0);
130127

131-
auto offsets = getMixedValues({ShapedType::kDynamic, ShapedType::kDynamic},
132-
{offX, offY}, rewriter);
128+
auto offsets = getMixedValues(staticOffsets, {offX}, rewriter);
133129
auto sizes = getMixedValues(originalShape, {}, rewriter);
134-
auto strides = getMixedValues({1, 1}, {}, rewriter);
130+
auto strides = getMixedValues(SmallVector<int64_t>(originalShape.size(), 1),
131+
{}, rewriter);
135132

136133
auto newSlice =
137134
rewriter

lib/gc/Transforms/GPU/LinalgToXeGPU.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,28 @@ static Value createFullMask(PatternRewriter &rewriter, Location loc,
6262
return res.getResult();
6363
}
6464

65+
// Extracts the offsets from a subview operation as values.
66+
// The difference from mlir::getMixedOffsets is that this function
67+
// returns the offsets as mlir::Value that can already be used as an argument
68+
// for other mlir::Operations.
69+
static SmallVector<Value> extractOffsetsAsValues(PatternRewriter &rewriter,
70+
Location loc,
71+
memref::SubViewOp subview) {
72+
SmallVector<Value> offsetValues;
73+
auto staticOffsets = subview.getStaticOffsets();
74+
auto dynamicOffsets = subview.getOffsets();
75+
size_t dynIdx = 0;
76+
for (size_t i = 0; i < staticOffsets.size(); i++) {
77+
if (staticOffsets[i] == ShapedType::kDynamic)
78+
offsetValues.push_back(dynamicOffsets[dynIdx++]);
79+
else
80+
offsetValues.push_back(
81+
rewriter.create<arith::ConstantIndexOp>(loc, staticOffsets[i]));
82+
}
83+
84+
return offsetValues;
85+
}
86+
6587
// Max number of elements to load/store from SLM
6688
constexpr int64_t maxSLMTileSize = 32;
6789

@@ -841,8 +863,11 @@ static SmallVector<Value> createSLMDescTiles(PatternRewriter &rewriter,
841863
// GPU kernel. We have to merge the subview offsets into the descriptor
842864
// offset.
843865
if (auto subView = dyn_cast<memref::SubViewOp>(src.getDefiningOp())) {
844-
auto xIntOffs = subView.getOffsets()[0];
845-
auto yIntOffs = subView.getOffsets()[1];
866+
auto offsets = extractOffsetsAsValues(rewriter, loc, subView);
867+
assert(offsets.size() == 2 && "Expected 2D subview offsets");
868+
869+
auto xIntOffs = offsets[0];
870+
auto yIntOffs = offsets[1];
846871

847872
// compute 'blockOffset' (beginning of the subview block in the original
848873
// flat memref)
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s
2+
3+
// Computes the thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
4+
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 256)>
5+
6+
func.func @entry() {
7+
%c1 = arith.constant 1 : index
8+
%c2 = arith.constant 2 : index
9+
%c3 = arith.constant 3 : index
10+
%c4 = arith.constant 4 : index
11+
12+
// Memory space wasn't assigned as it's allocated outside of gpu.launch block
13+
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<256xf16>
14+
%0 = memref.alloc() : memref<256xf16>
15+
// Capture thread-id variables
16+
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
17+
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
18+
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
19+
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
20+
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
21+
// Memory space was changed as it's explicitly specified
22+
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<256xf16, 1>
23+
%1 = memref.alloc() : memref<256xf16, 1>
24+
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 256 = 6144)
25+
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<6144xf16, 3>
26+
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
27+
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]]] [256] [1]
28+
// CHECK-SAME: memref<6144xf16, 3> to memref<256xf16, strided<[1], offset: ?>, 3>
29+
%2 = memref.alloc() : memref<256xf16>
30+
31+
// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
32+
// CHECK-SAME: memref<256xf16, 1>, memref<256xf16, strided<[1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<256xf16>)
33+
linalg.add ins(%1, %2 :memref<256xf16, 1>, memref<256xf16>) outs(%0 : memref<256xf16>)
34+
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<256xf16, 1>
35+
// Verify that there are no deallocs for SLM
36+
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
37+
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
38+
memref.dealloc %1 : memref<256xf16, 1>
39+
memref.dealloc %2 : memref<256xf16>
40+
gpu.terminator
41+
}
42+
return
43+
}

test/mlir/test/gc/Transforms/GPU/allocs-to-slm.mlir renamed to test/mlir/test/gc/Transforms/GPU/allocs-to-slm-2d.mlir

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,35 @@
11
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s
22

3+
// Computes the thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
4+
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 16)>
5+
36
func.func @entry() {
47
%c1 = arith.constant 1 : index
58
%c2 = arith.constant 2 : index
9+
%c3 = arith.constant 3 : index
610
%c4 = arith.constant 4 : index
711

812
// Memory space wasn't assigned as it's allocated outside of gpu.launch block
913
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<16x32xf16>
1014
%0 = memref.alloc() : memref<16x32xf16>
1115
// Capture thread-id variables
1216
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
13-
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[ARG5:.+]]) in
14-
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c4, %[[ARG11:.+]] = %c1) {
17+
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
18+
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
1519
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
16-
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c4, %sz_tz = %c1) {
20+
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
1721
// Memory space was changed as it's explicitly specifided
1822
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<16x32xf16, 1>
1923
%1 = memref.alloc() : memref<16x32xf16, 1>
20-
// Added 'shared' memory space
21-
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<32x128xf16, 3>
22-
// CHECK: %[[OFF_X:.*]] = arith.muli %[[THREAD_X]], %c16 : index
23-
// CHECK: %[[OFF_Y:.*]] = arith.muli %[[THREAD_Y]], %c32 : index
24-
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], %[[OFF_Y]]] [16, 32] [1, 1]
25-
// CHECK-SAME: memref<32x128xf16, 3> to memref<16x32xf16, strided<[128, 1], offset: ?>, 3>
24+
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 16 = 384)
25+
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<384x32xf16, 3>
26+
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
27+
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0] [16, 32] [1, 1]
28+
// CHECK-SAME: memref<384x32xf16, 3> to memref<16x32xf16, strided<[32, 1], offset: ?>, 3>
2629
%2 = memref.alloc() : memref<16x32xf16>
2730

2831
// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
29-
// CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[128, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
32+
// CHECK-SAME: memref<16x32xf16, 1>, memref<16x32xf16, strided<[32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<16x32xf16>)
3033
linalg.add ins(%1, %2 :memref<16x32xf16, 1>, memref<16x32xf16>) outs(%0 : memref<16x32xf16>)
3134
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<16x32xf16, 1>
3235
// Verify that there are no deallocs for SLM
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// RUN: gc-opt %s --allocs-to-slm | FileCheck %s
2+
3+
// Computex thread offset for SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
4+
// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 2)>
5+
6+
func.func @entry() {
7+
%c1 = arith.constant 1 : index
8+
%c2 = arith.constant 2 : index
9+
%c3 = arith.constant 3 : index
10+
%c4 = arith.constant 4 : index
11+
12+
// Memory space wasn't assigned as it's allocated outside of gpu.launch block
13+
// CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<2x3x16x32xf16>
14+
%0 = memref.alloc() : memref<2x3x16x32xf16>
15+
// Capture thread-id variables
16+
// CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
17+
// CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
18+
// CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
19+
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
20+
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
21+
// Memory space was changed as it's explicitly specifided
22+
// CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<2x3x16x32xf16, 1>
23+
%1 = memref.alloc() : memref<2x3x16x32xf16, 1>
24+
// Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 2 = 48)
25+
// CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<48x3x16x32xf16, 3>
26+
// CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
27+
// CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]], 0, 0, 0] [2, 3, 16, 32] [1, 1, 1, 1]
28+
// CHECK-SAME: memref<48x3x16x32xf16, 3> to memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>
29+
%2 = memref.alloc() : memref<2x3x16x32xf16>
30+
31+
// CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
32+
// CHECK-SAME: memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16, strided<[1536, 512, 32, 1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<2x3x16x32xf16>)
33+
linalg.add ins(%1, %2 :memref<2x3x16x32xf16, 1>, memref<2x3x16x32xf16>) outs(%0 : memref<2x3x16x32xf16>)
34+
// CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<2x3x16x32xf16, 1>
35+
// Verify that there are no deallocs for SLM
36+
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]] .*
37+
// CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]] .*
38+
memref.dealloc %1 : memref<2x3x16x32xf16, 1>
39+
memref.dealloc %2 : memref<2x3x16x32xf16>
40+
gpu.terminator
41+
}
42+
return
43+
}

0 commit comments

Comments
 (0)