Commit cfe043c
[mlir][linalg] Restrict scalable vectorisation (#98639)
Updates `vectorizeScalableVectorPrecondition` so that scalable vectorisation is only applied in well understood and tested scenarios.

It's unlikely that we would ever want an arbitrary dimension to be scalable. While the Linalg vectoriser should be flexible enough to handle all possibilities:
  * in more "exotic" cases, we are likely to struggle with lowerings further down the compilation stack,
  * it would be impractical given the limitations of LLVM (which usually reflect the limitations of actual hardware) - e.g. no support for "scalable" arrays of scalable or fixed-width vectors (*).

Ultimately, the goal of this patch is to better document what's currently supported. While this PR adds some new restrictions, no existing tests are affected.

(*) At the MLIR vector level that would correspond to e.g. `vector<[4]x8xf32>`.
1 parent 2df9fd7 commit cfe043c
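
As an illustrative aside (not part of the commit message), the type-level distinction behind the footnote can be sketched in MLIR; the `test.source` op below is a hypothetical placeholder used only to give the values a producer:

    // Lowerable: the trailing dim is scalable; after unrolling the leading
    // fixed dim this maps onto LLVM's <vscale x 8 x float>.
    %ok = "test.source"() : () -> vector<4x[8]xf32>

    // Not lowerable in practice: a "scalable array" of fixed-width vectors,
    // for which LLVM has no corresponding type.
    %bad = "test.source"() : () -> vector<[4]x8xf32>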

2 files changed: 128 additions, 10 deletions

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Lines changed: 62 additions & 9 deletions

@@ -1945,26 +1945,79 @@ vectorizePadOpPrecondition(tensor::PadOp padOp,
   return success();
 }
 
-/// Preconditions for scalable vectors.
+/// Preconditions for scalable vectors. This is quite restrictive - it models
+/// the fact that in practice we would only make selected dimensions scalable.
 static LogicalResult
 vectorizeScalableVectorPrecondition(Operation *op,
                                     ArrayRef<int64_t> inputVectorSizes,
                                     ArrayRef<bool> inputScalableVecDims) {
   assert(inputVectorSizes.size() == inputScalableVecDims.size() &&
          "Number of input vector sizes and scalable dims doesn't match");
 
-  if (inputVectorSizes.empty())
-    return success();
+  size_t numOfScalableDims =
+      llvm::count_if(inputScalableVecDims, [](bool flag) { return flag; });
 
-  bool isScalable = inputScalableVecDims.back();
-  if (!isScalable)
+  if (numOfScalableDims == 0)
     return success();
 
-  // Only element-wise and 1d depthwise conv ops supported in the presence of
-  // scalable dims.
   auto linalgOp = dyn_cast<LinalgOp>(op);
-  return success(linalgOp && (isElementwise(linalgOp) ||
-                              isa<linalg::DepthwiseConv1DNwcWcOp>(op)));
+
+  // Cond 1: There's been no need for scalable vectorisation of
+  // non-linalg Ops so far.
+  if (!linalgOp)
+    return failure();
+
+  // Cond 2: There's been no need for more than 2 scalable dims so far.
+  if (numOfScalableDims > 2)
+    return failure();
+
+  // Cond 3: Look at the configuration in `inputScalableVecDims` and verify
+  // that it matches one of the supported cases:
+  //  1. Exactly 1 dim is scalable and that's the _last_ parallel dim.
+  //  2. Exactly 2 dims are scalable and those are the _last two adjacent_
+  //     parallel dims.
+  // The 2nd restriction above means that only Matmul-like Ops are supported
+  // when 2 dims are scalable, e.g.:
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+
+  // Find the first scalable flag.
+  bool seenParallel = false;
+  auto iterators = linalgOp.getIteratorTypesArray();
+  SmallVector<bool> scalableFlags(inputScalableVecDims);
+  while (!scalableFlags.back()) {
+    seenParallel |= (iterators.back() == utils::IteratorType::parallel);
+
+    iterators.pop_back();
+    scalableFlags.pop_back();
+  }
+
+  // TODO: Support scalable vectorisation for reduction dims.
+  if (iterators.back() == utils::IteratorType::reduction)
+    return failure();
+
+  // If this is not the _last_ parallel dim, case 1. above is not met.
+  if (seenParallel)
+    return failure();
+
+  // If present, check the 2nd scalable dim. ATM, only Matmul-like Ops are
+  // supported, for which we expect the following config:
+  //    * iterators = [parallel, parallel, reduction]
+  //    * scalable flags = [true, true, false]
+  if (numOfScalableDims == 2) {
+    scalableFlags.pop_back();
+    iterators.pop_back();
+
+    if (!scalableFlags.back() ||
+        (iterators.back() != utils::IteratorType::parallel))
+      return failure();
+  }
+
+  // Cond 4: Only the following ops are supported in the presence of
+  // scalable vectors.
+  return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
+                 isa<linalg::MatmulTransposeAOp>(op) ||
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
 }
 
 LogicalResult mlir::linalg::vectorizeOpPrecondition(
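
To make Cond 3 concrete, here is an illustrative sketch (hypothetical, not taken from the patch; the function and payload names are made up) of a transform script the new precondition accepts: a matmul with iterators [parallel, parallel, reduction], vectorised with scalable flags [true, true, false], i.e. scalable M and N with fixed K:

    func.func @matmul_scalable_m_and_n(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
      linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
                    outs(%C : memref<?x?xf32>)
      return
    }

    module attributes {transform.with_named_sequence} {
      transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
        // The last two adjacent parallel dims (M, N) are scalable - case 2 of Cond 3.
        transform.structured.vectorize %matmul vector_sizes [[8], [16], 4] : !transform.any_op
        transform.yield
      }
    }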

mlir/test/Dialect/Linalg/vectorization-unsupported.mlir

Lines changed: 66 additions & 1 deletion

@@ -110,7 +110,7 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// -----
+// -----
 
 func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor<?xf32>, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> {
   %pad = arith.constant 0.000000e+00 : f32
@@ -126,3 +126,68 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
+                                  %acc: tensor<f32>) -> tensor<f32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+  return %0 : tensor<f32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
+                                                 %acc: tensor<?xf32>) -> tensor<?xf32> {
+
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%input : tensor<?x?xf32>)
+    outs(%acc : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_matmul_scalable_leading_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
+  // expected-error @+1 {{Attempted to vectorize, but failed}}
+  linalg.matmul ins(%A, %B: memref<?x?xf32>, memref<?x?xf32>)
+                outs(%C: memref<?x?xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %matmul vector_sizes [[8], 16, 4] : !transform.any_op
+    transform.yield
+  }
+}
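
For contrast with the last (rejected) test above, a hypothetical passing counterpart would keep the leading parallel dim fixed and make only the trailing parallel dim (N) scalable - case 1 of Cond 3. Such a case belongs in the supported-cases tests rather than in this file; names below are illustrative:

    func.func @linalg_matmul_scalable_trailing_parallel_dim(%A: memref<?x?xf32>, %B: memref<?x?xf32>, %C: memref<?x?xf32>) {
      // No "Attempted to vectorize, but failed" diagnostic expected here.
      linalg.matmul ins(%A, %B : memref<?x?xf32>, memref<?x?xf32>)
                    outs(%C : memref<?x?xf32>)
      return
    }

    module attributes {transform.with_named_sequence} {
      transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
        %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
        transform.structured.vectorize %matmul vector_sizes [8, [16], 4] : !transform.any_op
        transform.yield
      }
    }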
