@@ -6,11 +6,7 @@ func.func @matmul_tensors(
     -> tensor<8x32xf32> {
 // CHECK-NOT: linalg
 // CHECK: vector.extract {{.*}} : vector<4xf32> from vector<8x4xf32>
-// TODO: `vector.maskedstore` below could safely be replaced with
-// `vector.store`. It's present due to the vectorization logic for
-// `tensor.insert_slice` conservatively applying masks. However, in this case,
-// we should be able to remove it via value-bounds checks.
-// CHECK: vector.maskedstore {{.*}} : memref<8x32xf32>, vector<4xi1>, vector<4xf32>
+// CHECK: vector.store {{.*}} : memref<8x32xf32>, vector<4xf32>
   %0 = linalg.matmul ins(%arg0, %arg1: tensor<8x16xf32>, tensor<16x32xf32>)
                      outs(%arg2: tensor<8x32xf32>)
     -> tensor<8x32xf32>
@@ -24,16 +20,16 @@ module attributes {transform.with_named_sequence} {
       : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %2 = transform.get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
     transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op
+    %b = transform.bufferization.one_shot_bufferize
+        layout{IdentityLayoutMap} %module_op
+        {bufferize_function_boundaries = true, allow_return_allocs = true}
+        : (!transform.any_op) -> !transform.any_op
 
-    %f = transform.structured.match ops{["func.func"]} in %module_op
+    %f = transform.structured.match ops{["func.func"]} in %b
       : (!transform.any_op) -> !transform.any_op
 
     // TODO: group these lower-level controls into various properly named vector
     // lowering TD macros.
-    transform.apply_patterns to %f {
-      transform.apply_patterns.vector.lower_masked_transfers
-    } : !transform.any_op
-
     transform.apply_patterns to %f {
       transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
     } : !transform.any_op
@@ -50,37 +46,21 @@ module attributes {transform.with_named_sequence} {
       transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "linalg-copy"
     } : !transform.any_op
 
-    // By default, UnrollTransferWriteConversion (applied below via
-    // `transfer_to_scf`) will only work on MemRef(s). While there's an option
-    // to relax that, it's currently not wired-up with the TD logic. Bufferize
-    // here as otherwise unrolling will not work.
-    // TODO: Extend `transform.apply_patterns.vector.transfer_to_scf` to allow
-    // unrolling xfer Ops on tensors and move bufferization all the way down.
-    %b = transform.bufferization.one_shot_bufferize
-        layout{IdentityLayoutMap} %module_op
-        {bufferize_function_boundaries = true, allow_return_allocs = true}
-        : (!transform.any_op) -> !transform.any_op
-
-    %fb = transform.structured.match ops{["func.func"]} in %b
-        : (!transform.any_op) -> !transform.any_op
-
-    transform.apply_patterns to %fb {
+    transform.apply_patterns to %f {
       transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
     } : !transform.any_op
 
-    transform.apply_patterns to %fb {
+    transform.apply_patterns to %f {
       transform.apply_patterns.vector.lower_transfer max_transfer_rank = 1
     } : !transform.any_op
 
-    transform.apply_patterns to %fb {
+    transform.apply_patterns to %f {
       transform.apply_patterns.vector.lower_shape_cast
     } : !transform.any_op
 
-    transform.apply_patterns to %fb {
+    transform.apply_patterns to %f {
       transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_1d"
     } : !transform.any_op
-
-
     transform.yield
   }
 }