
Commit 9802803

[mlir][linalg] Add support for masked vectorization of tensor.insert_slice (2/N)
For context, recall that `tensor.insert_slice` is vectorised using the `vector.transfer_read` + `vector.transfer_write` pair. An unmasked example is shown below:

```mlir
// BEFORE VECTORIZATION
%res = tensor.insert_slice %slice into %dest[0, %c2] [5, 1] [1, 1] : tensor<5x1xi32> into tensor<5x3xi32>

// AFTER VECTORIZATION
%read = vector.transfer_read %source[%c0, %c0], %pad : tensor<5x1xi32>, vector<8x1xi32>
%res = vector.transfer_write %read, %dest[%c0, %c2] : vector<8x1xi32>, tensor<5x3xi32>
```

This PR extends `vectorizeAsInsertSliceOp` to add masking support for the `vector.transfer_write` operation. This complements the changes in #122927, which introduced masking for the `vector.transfer_read`.
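With user-provided vector sizes (e.g. `[8, 1]`), both transfers are now wrapped in `vector.mask` using a single shared mask. Below is a sketch of the masked output, adapted from the updated CHECK lines in `vectorization.mlir` further down this page; the `%c5`/`%c1` constant names are illustrative:

```mlir
// AFTER MASKED VECTORIZATION (vector sizes = [8, 1])
// The mask covers the static source shape 5x1 within the 8x1 vector.
%mask = vector.create_mask %c5, %c1 : vector<8x1xi1>
%read = vector.mask %mask { vector.transfer_read %source[%c0, %c0], %pad : tensor<5x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32>
%res = vector.mask %mask { vector.transfer_write %read, %dest[%c0, %c2] : vector<8x1xi32>, tensor<5x3xi32> } : vector<8x1xi1> -> tensor<5x3xi32>
```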
1 parent 709dd82 · commit 9802803

File tree: 3 files changed, +108 −52 lines


mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Lines changed: 26 additions & 22 deletions
```diff
@@ -2716,56 +2716,60 @@ vectorizeAsInsertSliceOp(RewriterBase &rewriter, tensor::InsertSliceOp sliceOp,
   }
   auto vecType = VectorType::get(vecShape, sourceType.getElementType());
 
-  // 3. Generate TransferReadOp.
-  SmallVector<Value> readIndices(
-      vecType.getRank(),
-      rewriter.create<arith::ConstantIndexOp>(sliceOp.getLoc(), 0));
-  Operation *read = rewriter.create<vector::TransferReadOp>(
-      sliceOp.getLoc(), vecType, source, readIndices, padValue,
-      ArrayRef<bool>{readInBounds});
+  // 3. Generate TransferReadOp + TransferWriteOp
+  ReifiedRankedShapedTypeDims reifiedSrcSizes;
+  Value maskOp;
 
-  // If vector sizes are user provided, make sure to mask xfer_read.
+  // If vector sizes are user provided, make sure to mask. First, generate the
+  // mask.
   if (!inputVectorSizes.empty()) {
     auto *srcDefOp = source.getDefiningOp();
     if (!srcDefOp) {
       LDBG("Unable to get the defining Op of " << sliceOp);
       return failure();
     }
 
-    ReifiedRankedShapedTypeDims reifiedSrcSizes;
     LogicalResult status =
         cast<ReifyRankedShapedTypeOpInterface>(srcDefOp).reifyResultShapes(
             rewriter, reifiedSrcSizes);
     if (status.failed()) {
-      LDBG("Unable to reify result shapes of " << sliceOp);
+      LDBG("Unable to reify result shapes of " << srcDefOp);
       return failure();
     }
 
     // Create the mask
-    SmallVector<int64_t> readMaskShape(
-        sliceOp.getSource().getType().getShape());
     auto readMaskType = VectorType::get(inputVectorSizes, rewriter.getI1Type());
-    Value maskOp = rewriter.create<vector::CreateMaskOp>(
+    maskOp = rewriter.create<vector::CreateMaskOp>(
         sliceOp.getLoc(), readMaskType, reifiedSrcSizes[0]);
-
-    // Mask the xfer_read Op
-    read = mlir::vector::maskOperation(rewriter, read, maskOp);
   }
 
-  // 4. Generate TransferWriteOp.
-  if (!inputVectorSizes.empty() &&
-      ShapedType::isDynamicShape(resultType.getShape())) {
-    LDBG("TODO: Masking of xfer_write when vectorising " << sliceOp);
-    return failure();
+  // 3.a. TransferReadOp
+  SmallVector<Value> readIndices(
+      vecType.getRank(),
+      rewriter.create<arith::ConstantIndexOp>(sliceOp.getLoc(), 0));
+  Operation *read = rewriter.create<vector::TransferReadOp>(
+      sliceOp.getLoc(), vecType, source, readIndices, padValue,
+      ArrayRef<bool>{readInBounds});
+
+  // Mask the xfer_read Op
+  if (!inputVectorSizes.empty()) {
+    read = mlir::vector::maskOperation(rewriter, read, maskOp);
   }
 
+  // 3.b. TransferWriteOp
   auto writeIndices = getValueOrCreateConstantIndexOp(
       rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
 
-  // 5. Finalize
   Operation *write = rewriter.create<vector::TransferWriteOp>(
       sliceOp.getLoc(), read->getResult(0), sliceOp.getDest(), writeIndices,
       ArrayRef<bool>{writeInBounds});
+
+  // Mask the xfer_write Op
+  if (!inputVectorSizes.empty()) {
+    write = mlir::vector::maskOperation(rewriter, write, maskOp);
+  }
+
+  // 4. Finalize
   newResults.push_back(write->getResult(0));
 
   return success();
```

mlir/test/Dialect/Linalg/vectorization-unsupported.mlir

Lines changed: 0 additions & 23 deletions
```diff
@@ -280,26 +280,3 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
-
-// -----
-
-// One of the _destination_ dimensions is dynamic (but _source_ dimensions are static).
-
-func.func private @insert_slice_dynamic_dest_dim(%source: tensor<?x3x?x1xi32>, %size: index) -> tensor<?x3xi32> {
-  %c2 = arith.constant 2 : index
-  %init = tensor.empty(%size) : tensor<?x3xi32>
-
-  %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<5x1xi32>
-  // expected-error @+1 {{Attempted to vectorize, but failed}}
-  %res = tensor.insert_slice %source_slice into %init[0, %c2] [5, 1] [1, 1] : tensor<5x1xi32> into tensor<?x3xi32>
-
-  return %res : tensor<?x3xi32>
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op
-    transform.yield
-  }
-}
```

mlir/test/Dialect/Linalg/vectorization.mlir

Lines changed: 82 additions & 7 deletions
```diff
@@ -1130,14 +1130,14 @@ func.func private @insert_slice_static_sizes(%source: tensor<?x3x?x1xi32>) -> te
 // CHECK: %[[C_2:.*]] = arith.constant 2 : index
 // CHECK: %[[INIT:.*]] = tensor.empty() : tensor<5x3xi32>
 // CHECK: %[[SRC_SLICE:.*]] = tensor.extract_slice %[[SEC]][0, %[[C_2]], 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<5x1xi32>
-// CHECK: %[[PAD:.*]] = arith.constant 0 : i32
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[C_5:.*]] = arith.constant 5 : index
-// CHECK: %[[C_1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[C_5:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[C_1:.*]] = arith.constant 1 : index
 // CHECK: %[[MASK:.*]] = vector.create_mask %[[C_5]], %[[C_1]] : vector<8x1xi1>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SLICE]][%[[C0]], %[[C0]]], %[[PAD]] : tensor<5x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32>
 // CHECK: %[[C_0:.*]] = arith.constant 0 : index
-// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[INIT]][%[[C_0]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32>
+// CHECK: %[[RES:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[INIT]][%[[C_0]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32> } : vector<8x1xi1> -> tensor<5x3xi32>
 // CHECK: return %[[RES]] : tensor<5x3xi32>
 
 module attributes {transform.with_named_sequence} {
@@ -1170,11 +1170,11 @@ func.func private @insert_slice_dynamic_src_dim(%source: tensor<?x3x?x1xi32>, %s
 // CHECK: %[[SRC_SLICE:.*]] = tensor.extract_slice %[[SRC]][0, %[[C_2]], 0, 0] [1, 1, %[[SIZE]], 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<?x1xi32>
 // CHECK-DAG: %[[PAD:.*]] = arith.constant 0 : i32
 // CHECK-DAG: %[[C_1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C_0:.*]] = arith.constant 0 : index
 // CHECK: %[[MASK:.*]] = vector.create_mask %[[SIZE]], %[[C_1]] : vector<8x1xi1>
+// CHECK: %[[C_0:.*]] = arith.constant 0 : index
 // CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SLICE]][%[[C_0]], %[[C_0]]], %[[PAD]] : tensor<?x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32>
 // CHECK: %[[C_0_1:.*]] = arith.constant 0 : index
-// CHECK: %[[RES:.*]] = vector.transfer_write %[[READ]], %[[INIT]][%[[C_0_1]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32>
+// CHECK: %[[RES:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[INIT]][%[[C_0_1]], %[[C_2]]] : vector<8x1xi32>, tensor<5x3xi32> } : vector<8x1xi1> -> tensor<5x3xi32>
 // CHECK: return %[[RES]] : tensor<5x3xi32>
 
 module attributes {transform.with_named_sequence} {
@@ -1184,3 +1184,78 @@ func.func private @insert_slice_dynamic_src_dim(%source: tensor<?x3x?x1xi32>, %s
     transform.yield
   }
 }
+
+// -----
+
+// One of the _destination_ dimensions is dynamic (but _source_ dimensions are static).
+
+func.func private @insert_slice_dynamic_dest_dim(%source: tensor<?x3x?x1xi32>, %size: index) -> tensor<?x3xi32> {
+  %c2 = arith.constant 2 : index
+  %init = tensor.empty(%size) : tensor<?x3xi32>
+
+  %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<5x1xi32>
+  %res = tensor.insert_slice %source_slice into %init[0, %c2] [5, 1] [1, 1] : tensor<5x1xi32> into tensor<?x3xi32>
+
+  return %res : tensor<?x3xi32>
+}
+
+// CHECK-LABEL: func.func private @insert_slice_dynamic_dest_dim(
+// CHECK-SAME: %[[SRC:.*]]: tensor<?x3x?x1xi32>,
+// CHECK-SAME: %[[size:.*]]: index) -> tensor<?x3xi32> {
+// CHECK: %[[C_2:.*]] = arith.constant 2 : index
+// CHECK: %[[INIT:.*]] = tensor.empty(%[[size]]) : tensor<?x3xi32>
+// CHECK: %[[SRC_SLICE:.*]] = tensor.extract_slice %[[SRC]][0, %[[C_2]], 0, 0] [1, 1, 5, 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<5x1xi32>
+// CHECK: %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK: %[[C_5:.*]] = arith.constant 5 : index
+// CHECK: %[[C_1:.*]] = arith.constant 1 : index
+// CHECK: %[[MASK:.*]] = vector.create_mask %[[C_5]], %[[C_1]] : vector<8x1xi1>
+// CHECK: %[[C_0:.*]] = arith.constant 0 : index
+// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SLICE]][%[[C_0]], %[[C_0]]], %[[PAD]] : tensor<5x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32>
+// CHECK: %[[C_0_1:.*]] = arith.constant 0 : index
+// CHECK: %[[WRITE:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[INIT]][%[[C_0_1]], %[[C_2]]] : vector<8x1xi32>, tensor<?x3xi32> } : vector<8x1xi1> -> tensor<?x3xi32>
+// CHECK: return %[[WRITE]] : tensor<?x3xi32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// At least one _source_ and one _destination_ dimensions are dynamic.
+
+func.func private @insert_slice_dynamic_source_and_dest_dim(%source: tensor<?x3x?x1xi32>, %size: index) -> tensor<?x3xi32> {
+  %c2 = arith.constant 2 : index
+  %init = tensor.empty(%size) : tensor<?x3xi32>
+
+  %source_slice = tensor.extract_slice %source[0, %c2, 0, 0] [1, 1, %size, 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<?x1xi32>
+  %res = tensor.insert_slice %source_slice into %init[0, %c2] [%size, 1] [1, 1] : tensor<?x1xi32> into tensor<?x3xi32>
+
+  return %res : tensor<?x3xi32>
+}
+
+// CHECK-LABEL: func.func private @insert_slice_dynamic_source_and_dest_dim(
+// CHECK-SAME: %[[SRC:.*]]: tensor<?x3x?x1xi32>,
+// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor<?x3xi32> {
+// CHECK: %[[C_2:.*]] = arith.constant 2 : index
+// CHECK: %[[INIT:.*]] = tensor.empty(%[[SIZE]]) : tensor<?x3xi32>
+// CHECK: %[[SRC_SIZE:.*]] = tensor.extract_slice %[[SRC]][0, %[[C_2]], 0, 0] [1, 1, %[[SIZE]], 1] [1, 1, 1, 1] : tensor<?x3x?x1xi32> to tensor<?x1xi32>
+// CHECK: %[[PAD:.*]] = arith.constant 0 : i32
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: %[[MASK:.*]] = vector.create_mask %[[SIZE]], %[[C1]] : vector<8x1xi1>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[READ:.*]] = vector.mask %[[MASK]] { vector.transfer_read %[[SRC_SIZE]]{{\[}}%[[C0]], %[[C0]]], %[[PAD]] : tensor<?x1xi32>, vector<8x1xi32> } : vector<8x1xi1> -> vector<8x1xi32>
+// CHECK: %[[C_0_1:.*]] = arith.constant 0 : index
+// CHECK: %[[WRITE:.*]] = vector.mask %[[MASK]] { vector.transfer_write %[[READ]], %[[INIT]]{{\[}}%[[C_0_1]], %[[C_2]]] : vector<8x1xi32>, tensor<?x3xi32> } : vector<8x1xi1> -> tensor<?x3xi32>
+// CHECK: return %[[WRITE]] : tensor<?x3xi32>
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [8, 1] : !transform.any_op
+    transform.yield
+  }
+}
```
