
Commit cf4a2c5

fixup! [mlir][linalg] Refactor vectorization hooks to improve code reuse

* Restore the original behaviour in `vectorizeAsInsertSliceOp`, whereby the `in_bounds` attribute was used to identify potentially out-of-bounds accesses. Masks are only used when input vector sizes are specified.
* Revert the changes in insert-slice-with-patterns.mlir and pad-with-patterns.mlir, i.e. the tests in which we don't specify vector sizes.
* Other minor updates.
1 parent 82cc2fe commit cf4a2c5
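The behavioural difference is easiest to see in the IR. Below is a minimal sketch assembled from the test expectations in this commit (value names such as %src and %pad are illustrative): without user-specified vector sizes, a potentially out-of-bounds dynamic dim is recorded via the `in_bounds` attribute; with vector sizes, the access is guarded by a mask instead.

// Without input vector sizes: the dynamic dim is simply marked as
// potentially out-of-bounds via `in_bounds`; no mask is created.
%read = vector.transfer_read %src[%c0, %c0, %c0], %pad
    {in_bounds = [true, false, true]} : tensor<2x?x2xf32>, vector<2x3x2xf32>

// With input vector sizes: the same read is guarded by a mask built
// from the runtime extent of the dynamic dim.
%d1 = tensor.dim %src, %c1 : tensor<2x?x2xf32>
%mask = vector.create_mask %c2, %d1, %c2 : vector<2x3x2xi1>
%masked = vector.mask %mask {
  vector.transfer_read %src[%c0, %c0, %c0], %pad
      {in_bounds = [true, true, true]} : tensor<2x?x2xf32>, vector<2x3x2xf32>
} : vector<2x3x2xi1> -> vector<2x3x2xf32>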

File tree: 3 files changed, +23 -53 lines changed

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Lines changed: 12 additions & 26 deletions
@@ -1659,9 +1659,10 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
                static_cast<size_t>(vecToStoreType.getRank()) &&
            "Insufficient number of input vector sizes!");
     // Update the inBounds attribute.
-    for (unsigned i = 0; i < destRank; i++)
-      inBoundsVal[i] = (destShape[i] == inputVecSizesForLeadingDims[i]) &&
-                       !ShapedType::isDynamic(destShape[i]);
+    for (unsigned i = 0; i < vecToStoreRank; i++)
+      inBoundsVal[i] =
+          (destShape[i] == inputVecSizesForLeadingDims[i]) &&
+          !ShapedType::isDynamic(destShape[destRank - vecToStoreRank + i]);
   }

   // If missing, initialize the write indices to 0.
@@ -1670,7 +1671,7 @@ createWriteOrMaskedWrite(OpBuilder &builder, Location loc, Value vectorToStore,
          "Invalid number of write indices!");
   if (writeIndices.empty()) {
     auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-    writeIndices = SmallVector<Value>(destRank, zero);
+    writeIndices.assign(destRank, zero);
   }

   // Generate the xfer_write Op
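With this logic, a dim stays `in_bounds` only when the destination size is static and matches the corresponding input vector size; everything else is conservatively marked out-of-bounds. A hypothetical sketch of the resulting write (shapes are illustrative, not taken from this patch):

// dest dim 0 is dynamic, so it cannot be proven in-bounds; dim 1 is
// static and matches the vector size, so it stays in-bounds.
%w = vector.transfer_write %vec, %dest[%c0, %c0]
    {in_bounds = [false, true]} : vector<4x8xf32>, tensor<?x8xf32>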
@@ -1826,8 +1827,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp,
       transposeOp.getResult().getType().getElementType());
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, transposeOp.getResult(), dest,
-      /*inputVecSizesForLeadingDims=*/inputVectorSizes, /*writeIndices=*/{},
-      /*useInBoundsInsteadOfMasking=*/false);
+      /*inputVecSizesForLeadingDims=*/inputVectorSizes);
   newResults.push_back(write->getResult(0));
   return success();
 }
@@ -2000,8 +2000,7 @@ vectorizeAsTensorPadOp(RewriterBase &rewriter, tensor::PadOp padOp,
       loc, reifiedReturnShapes[0], padOp.getResultType().getElementType());
   Operation *write = createWriteOrMaskedWrite(
       rewriter, loc, maskedRead, dest,
-      /*inputVecSizesForLeadingDims=*/inputVectorSizes, {},
-      /*useInBoundsInsteadOfMasking=*/false);
+      /*inputVecSizesForLeadingDims=*/inputVectorSizes);
   newResults.push_back(write->getResult(0));
   return success();
 }
@@ -3007,39 +3006,24 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
         sliceOp.getLoc(), elemType, rewriter.getZeroAttr(elemType));
   }

-  // 2. Get the vector shape and in-bounds attributes
+  // 2. Get the vector shape
   SmallVector<int64_t> vecShape;
-  SmallVector<bool> readInBounds;
-  SmallVector<bool> writeInBounds;
   size_t rankDiff = resultType.getRank() - sourceType.getRank();
   for (int64_t i = 0, end = sourceType.getRank(); i < end; ++i) {
     if (!inputVectorSizes.empty()) {
       vecShape.push_back(inputVectorSizes[i]);
-      readInBounds.push_back(false);
-      writeInBounds.push_back(false);
     } else if (!sourceType.isDynamicDim(i)) {
       vecShape.push_back(sourceType.getDimSize(i));
-      // Source shape is statically known: Neither read nor write are
-      // out-of-bounds.
-      readInBounds.push_back(true);
-      writeInBounds.push_back(true);
     } else if (!resultType.isDynamicDim(i)) {
       // Source shape is not statically known, but result shape is.
       // Vectorize with size of result shape. This may be larger than the
       // source size.
       // FIXME: Using rankDiff implies that the source tensor is inserted at
       // the end of the destination tensor. However, that's not required.
       vecShape.push_back(resultType.getDimSize(rankDiff + i));
-      // Read may be out-of-bounds because the result size could be larger
-      // than the source size.
-      readInBounds.push_back(false);
-      // Write will be in-bounds provided that the corresponding write idx is 0.
-      // To keep this logic simple, conservatively mark as out-of-bounds.
-      writeInBounds.push_back(false);
     } else {
       // Neither source nor result dim of padOp is static. Cannot vectorize
       // the copy.
-      // TODO: Add support for masking
       return failure();
     }
   }
@@ -3052,13 +3036,15 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
   SmallVector<Value> readIndices(
       vecType.getRank(), rewriter.create<arith::ConstantIndexOp>(loc, 0));
   Value read = mlir::vector::createReadOrMaskedRead(
-      rewriter, loc, source, vecType.getShape(), padValue);
+      rewriter, loc, source, vecType.getShape(), padValue,
+      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty());

   // Create write
   auto writeIndices =
       getValueOrCreateConstantIndexOp(rewriter, loc, sliceOp.getMixedOffsets());
   Operation *write = createWriteOrMaskedWrite(
-      rewriter, loc, read, sliceOp.getDest(), vecType.getShape(), writeIndices);
+      rewriter, loc, read, sliceOp.getDest(), vecType.getShape(), writeIndices,
+      /*useInBoundsInsteadOfMasking=*/inputVectorSizes.empty());

   // 4. Finalize
   newResults.push_back(write->getResult(0));
mlir/test/Dialect/Linalg/vectorization/insert-slice-with-patterns.mlir

Lines changed: 1 addition & 10 deletions
@@ -67,19 +67,10 @@ module attributes {transform.with_named_sequence} {
 // CHECK-SAME:    %[[ARG_0:.*]]: tensor<1x?x3xf32>,
 // CHECK-SAME:    %[[PAD:.*]]: f32,
 // CHECK-SAME:    %[[SIZE:.*]]: index) -> tensor<9x8x7x1x2x3xf32> {
-// CHECK:         %[[C3:.*]] = arith.constant 3 : index
-// CHECK:         %[[C1:.*]] = arith.constant 1 : index
-// CHECK:         %[[C0:.*]] = arith.constant 0 : index
 // CHECK:         %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
 // CHECK:         %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
 // CHECK:         %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
-
-// CHECK:         %[[D1:.*]] = tensor.dim %[[ARG_0]], %[[C1]] : tensor<1x?x3xf32>
-// CHECK:         %[[MASK:.*]] = vector.create_mask %[[C1]], %[[D1]], %[[C3]] : vector<1x2x3xi1>
-// CHECK:         %[[READ:.*]] = vector.mask %[[MASK]] {
-// CHECK-SAME:      vector.transfer_read %[[ARG_0]][%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x?x3xf32>, vector<1x2x3xf32>
-// CHECK-SAME:    } : vector<1x2x3xi1> -> vector<1x2x3xf32>
-
+// CHECK:         %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] {in_bounds = [true, false, true]} : tensor<1x?x3xf32>, vector<1x2x3xf32>
 // CHECK:         %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} {in_bounds = [true, true, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
 // CHECK:         return %[[RES]] : tensor<9x8x7x1x2x3xf32>
 func.func @insert_dynamic_slice_non_zero_pad(%arg0: tensor<1x?x3xf32>, %pad : f32, %size: index) -> tensor<9x8x7x1x2x3xf32> {
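For contrast, when vector sizes are specified, the insert-slice path still produces masked transfers rather than relying on `in_bounds`. A rough sketch of the masked write side (shapes, mask operands, and value names are illustrative, not from this test):

// Mask the write with the runtime extent of the dynamic dim so that
// out-of-bounds lanes are never stored.
%d0 = tensor.dim %src, %c0 : tensor<?x8xf32>
%wmask = vector.create_mask %d0, %c8 : vector<4x8xi1>
%res = vector.mask %wmask {
  vector.transfer_write %read, %dest[%c0, %c0]
      {in_bounds = [true, true]} : vector<4x8xf32>, tensor<?x8xf32>
} : vector<4x8xi1> -> tensor<?x8xf32>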

mlir/test/Dialect/Linalg/vectorization/pad-with-patterns.mlir

Lines changed: 10 additions & 17 deletions
@@ -5,23 +5,16 @@
 ///----------------------------------------------------------------------------------------

 // CHECK-LABEL: func @pad_static(
-// CHECK-SAME:  %[[ARG0:.*]]: tensor<2x?x2xf32>,
-// CHECK-SAME:  %[[ARG1:.*]]: f32) -> tensor<2x3x4xf32> {
-// CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG:   %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-// CHECK:       %[[EMPTY:.*]] = tensor.empty() : tensor<2x3x4xf32>
-// CHECK:       %[[INIT:.*]] = vector.broadcast %[[ARG1]] : f32 to vector<2x3x4xf32>
-// CHECK:       %[[OUT_TENSOR:.*]] = vector.transfer_write %[[INIT]], %[[EMPTY]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<2x3x4xf32>, tensor<2x3x4xf32>
-// CHECK:       %[[DIM_1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<2x?x2xf32>
-// CHECK:       %[[MASK_READ:.*]] = vector.create_mask %[[C2]], %[[DIM_1]], %[[C2]] : vector<2x3x2xi1>
-// CHECK:       %[[READ:.*]] = vector.mask %[[MASK_READ]] {
-// CHECK-SAME:    vector.transfer_read %[[ARG0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[ARG1]]
-// CHECK-SAME:    {in_bounds = [true, true, true]} : tensor<2x?x2xf32>, vector<2x3x2xf32>
-// CHECK-SAME:  } : vector<2x3x2xi1> -> vector<2x3x2xf32>
-// CHECK:       %[[RESULT:.*]] = vector.transfer_write %[[READ]], %[[OUT_TENSOR]]{{\[}}%[[C0]], %[[C0]], %[[C2]]]
-// CHECK-SAME:  {in_bounds = [true, true, true]} : vector<2x3x2xf32>, tensor<2x3x4xf32>
-// CHECK:       return %[[RESULT]] : tensor<2x3x4xf32>
+// CHECK-SAME:  %[[ARG0:.*]]: tensor<2x?x2xf32>, %[[PAD:.*]]: f32
+// CHECK-NOT:   tensor.pad
+// CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:   %[[INIT:.*]] = tensor.empty() : tensor<2x3x4xf32>
+// CHECK-DAG:   %[[VEC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<2x3x4xf32>
+// CHECK:       %[[FILL:.*]] = vector.transfer_write %[[VEC]], %[[INIT]]{{.*}} : vector<2x3x4xf32>, tensor<2x3x4xf32>
+// CHECK:       %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, false, true]} : tensor<2x?x2xf32>, vector<2x3x2xf32>
+// CHECK:       %[[RESULT:.*]] = vector.transfer_write %[[READ]], %[[FILL]][%[[C0]], %[[C0]], %[[C2]]] {in_bounds = [true, true, true]} : vector<2x3x2xf32>, tensor<2x3x4xf32>
+// CHECK:       return %[[RESULT]]
 func.func @pad_static(%arg0: tensor<2x?x2xf32>, %pad_value: f32) -> tensor<2x3x4xf32> {
   %0 = tensor.pad %arg0 low[0, 0, 2] high[0, 1, 0] {
   ^bb0(%arg1: index, %arg2: index, %arg3: index):