@@ -2281,115 +2281,6 @@ LogicalResult mlir::linalg::vectorizeCopy(RewriterBase &rewriter,
 //----------------------------------------------------------------------------//
 // Misc. vectorization patterns.
 //----------------------------------------------------------------------------//
-
-/// Helper function that retrieves the value of an IntegerAttr.
-static int64_t getIntFromAttr(Attribute attr) {
-  return cast<IntegerAttr>(attr).getInt();
-}
-
-/// Given an ArrayRef of OpFoldResults, return a vector of Values.
-/// IntegerAttrs are converted to ConstantIndexOps. Other attribute types are
-/// not supported.
-static SmallVector<Value> ofrToIndexValues(RewriterBase &rewriter, Location loc,
-                                           ArrayRef<OpFoldResult> ofrs) {
-  SmallVector<Value> result;
-  for (auto o : ofrs) {
-    if (auto val = llvm::dyn_cast_if_present<Value>(o)) {
-      result.push_back(val);
-    } else {
-      result.push_back(rewriter.create<arith::ConstantIndexOp>(
-          loc, getIntFromAttr(o.template get<Attribute>())));
-    }
-  }
-  return result;
-}
-
-/// Rewrite a tensor::PadOp into a sequence of EmptyOp, FillOp and
-/// InsertSliceOp. For now, only constant padding values are supported.
-/// If there is enough static type information, TransferReadOps and
-/// TransferWriteOps may be generated instead of InsertSliceOps.
-struct GenericPadOpVectorizationPattern : public GeneralizePadOpPattern {
-  GenericPadOpVectorizationPattern(MLIRContext *context,
-                                   PatternBenefit benefit = 1)
-      : GeneralizePadOpPattern(context, tryVectorizeCopy, benefit) {}
-  /// Vectorize the copying of a tensor::PadOp's source. This is possible if
-  /// each dimension size is statically know in the source type or the result
-  /// type (or both).
-  static LogicalResult tryVectorizeCopy(RewriterBase &rewriter,
-                                        tensor::PadOp padOp, Value dest) {
-    auto sourceType = padOp.getSourceType();
-    auto resultType = padOp.getResultType();
-    if (!VectorType::isValidElementType(sourceType.getElementType()))
-      return failure();
-
-    // Copy cannot be vectorized if pad value is non-constant and source shape
-    // is dynamic. In case of a dynamic source shape, padding must be appended
-    // by TransferReadOp, but TransferReadOp supports only constant padding.
-    auto padValue = padOp.getConstantPaddingValue();
-    if (!padValue) {
-      if (!sourceType.hasStaticShape())
-        return failure();
-      // Create dummy padding value.
-      auto elemType = sourceType.getElementType();
-      padValue = rewriter.create<arith::ConstantOp>(
-          padOp.getLoc(), elemType, rewriter.getZeroAttr(elemType));
-    }
-
-    SmallVector<int64_t> vecShape;
-    SmallVector<bool> readInBounds;
-    SmallVector<bool> writeInBounds;
-    for (unsigned i = 0; i < sourceType.getRank(); ++i) {
-      if (!sourceType.isDynamicDim(i)) {
-        vecShape.push_back(sourceType.getDimSize(i));
-        // Source shape is statically known: Neither read nor write are
-        // out-of-bounds.
-        readInBounds.push_back(true);
-        writeInBounds.push_back(true);
-      } else if (!resultType.isDynamicDim(i)) {
-        // Source shape is not statically known, but result shape is.
-        // Vectorize with size of result shape. This may be larger than the
-        // source size.
-        vecShape.push_back(resultType.getDimSize(i));
-        // Read may be out-of-bounds because the result size could be larger
-        // than the source size.
-        readInBounds.push_back(false);
-        // Write is out-of-bounds if low padding > 0.
-        writeInBounds.push_back(
-            getConstantIntValue(padOp.getMixedLowPad()[i]) ==
-            static_cast<int64_t>(0));
-      } else {
-        // Neither source nor result dim of padOp is static. Cannot vectorize
-        // the copy.
-        return failure();
-      }
-    }
-    auto vecType = VectorType::get(vecShape, sourceType.getElementType());
-
-    // Generate TransferReadOp.
-    SmallVector<Value> readIndices(
-        vecType.getRank(),
-        rewriter.create<arith::ConstantIndexOp>(padOp.getLoc(), 0));
-    auto read = rewriter.create<vector::TransferReadOp>(
-        padOp.getLoc(), vecType, padOp.getSource(), readIndices, padValue,
-        ArrayRef<bool>{readInBounds});
-
-    // If `dest` is a FillOp and the TransferWriteOp would overwrite the
-    // entire tensor, write directly to the FillOp's operand.
-    if (llvm::equal(vecShape, resultType.getShape()) &&
-        llvm::all_of(writeInBounds, [](bool b) { return b; }))
-      if (auto fill = dest.getDefiningOp<FillOp>())
-        dest = fill.output();
-
-    // Generate TransferWriteOp.
-    auto writeIndices =
-        ofrToIndexValues(rewriter, padOp.getLoc(), padOp.getMixedLowPad());
-    rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
-        padOp, read, dest, writeIndices, ArrayRef<bool>{writeInBounds});
-
-    return success();
-  }
-};
-
 /// Base pattern for rewriting tensor::PadOps whose result is consumed by a
 /// given operation type OpTy.
 template <typename OpTy>
@@ -2623,6 +2514,177 @@ struct PadOpVectorizationWithTransferWritePattern
   }
 };
 
+/// Given an ArrayRef of OpFoldResults, return a vector of Values.
+/// IntegerAttrs are converted to ConstantIndexOps. Other attribute types are
+/// not supported.
+static SmallVector<Value> ofrToIndexValues(RewriterBase &rewriter, Location loc,
+                                           ArrayRef<OpFoldResult> ofrs) {
+  SmallVector<Value> result;
+  for (auto o : ofrs) {
+    if (auto val = llvm::dyn_cast_if_present<Value>(o)) {
+      result.push_back(val);
+    } else {
+      result.push_back(rewriter.create<arith::ConstantIndexOp>(
+          loc, cast<IntegerAttr>(cast<Attribute>(o)).getInt()));
+    }
+  }
+  return result;
+}
+
+/// Returns the effective pad value for the input op, provided it's a scalar.
+///
+/// Many Ops exhibit pad-like behaviour, but this isn't always explicit. If
+/// this Op performs padding, retrieve the padding value provided that it's
+/// a scalar and static/fixed for all the padded values. Returns an empty value
+/// otherwise.
+static Value getStaticPadVl(Operation *op) {
+  if (!op)
+    return {};
+
+  // 1. vector.broadcast - return the value that's being broadcast,
+  // provided that it's a scalar.
+  if (auto bcast = llvm::dyn_cast<vector::BroadcastOp>(op)) {
+    auto source = bcast.getSource();
+    if (llvm::dyn_cast<VectorType>(source.getType()))
+      return {};
+
+    return source;
+  }
+
+  // 2. linalg.fill - use the scalar input value that was used to fill the
+  // output tensor.
+  if (auto fill = llvm::dyn_cast<linalg::FillOp>(op)) {
+    return fill.getInputs()[0];
+  }
+
+  // 3. tensor.generate - can't guarantee the value is fixed without further
+  // analysis, bail out.
+  if (auto generate = llvm::dyn_cast<tensor::GenerateOp>(op)) {
+    return {};
+  }
+
+  // 4. vector.transfer_write - inspect the input vector that's written from.
+  // If it contains a single value that has been broadcast (e.g. via
+  // vector.broadcast), extract it, fail otherwise.
+  if (auto xferWrite = llvm::dyn_cast<vector::TransferWriteOp>(op))
+    return getStaticPadVl(xferWrite.getVector().getDefiningOp());
+
+  // 5. tensor.insert_slice - inspect the destination tensor. If it's larger
+  // than the input tensor, then, provided it's constant, we'll extract the
+  // value that was used to generate it (via e.g. linalg.fill), fail otherwise.
+  // TODO: Clarify the semantics when the input tensor is larger than the
+  // destination.
+  if (auto slice = llvm::dyn_cast<tensor::InsertSliceOp>(op))
+    return getStaticPadVl(slice.getDest().getDefiningOp());
+
+  return {};
+}
+
+/// Rewrite tensor.insert_slice as a vector.transfer_read +
+/// vector.transfer_write pair. The vector size is inferred from the static
+/// dims in the input and output tensors. If a dim is dynamic in both the
+/// input and output tensors, bail out.
+///
+/// Before:
+///     !t_in_type = tensor<1x2x3xf32>
+///     !t_out_type = tensor<9x8x7x1x2x3xf32>
+///     !v_type = vector<1x2x3xf32>
+///     %inserted_slice = tensor.insert_slice %src into %dest ... : !t_in_type
+///     into !t_out_type
+/// After:
+///     %read = vector.transfer_read %src[...], %pad ... : !t_in_type, !v_type
+///     %write = vector.transfer_write %read, %dest ... : !v_type, !t_out_type
+///
+/// TODO: Support masking
+struct InsertSliceVectorizePattern
+    : public OpRewritePattern<tensor::InsertSliceOp> {
+  using OpRewritePattern<tensor::InsertSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::InsertSliceOp sliceOp,
+                                PatternRewriter &rewriter) const final {
+    auto sourceType = sliceOp.getSource().getType();
+    if (!VectorType::isValidElementType(sourceType.getElementType()))
+      return failure();
+
+    auto resultType = sliceOp.getResultType();
+
+    // 1. Get the pad value.
+    // TransferReadOp requires a scalar padding value. Note that:
+    //    * for in-bounds access, the value is actually irrelevant.
+    // There are 2 cases in which xfer.read accesses are known to be in-bounds:
+    //  1. The source shape is static (output vector sizes would be based on
+    //     the source shape and hence all memory accesses would be in-bounds),
+    //  2. Masking is used (output vector sizes would be user-provided, in
+    //     which case it is assumed that all memory accesses are in-bounds).
+    //     This remains a TODO.
+    //
+    // When the value is not known and not needed, use 0. Otherwise, bail out.
+    Value padValue = getStaticPadVl(sliceOp);
+    bool isOutOfBoundsRead = !sourceType.hasStaticShape();
+
+    if (!padValue && isOutOfBoundsRead) {
+      LDBG("Failed to get a pad value for out-of-bounds read access\n");
+      return failure();
+    }
+
+    if (!padValue) {
+      auto elemType = sourceType.getElementType();
+      padValue = rewriter.create<arith::ConstantOp>(
+          sliceOp.getLoc(), elemType, rewriter.getZeroAttr(elemType));
+    }
+
+    // 2. Get the vector shape and in-bounds attributes.
+    SmallVector<int64_t> vecShape;
+    SmallVector<bool> readInBounds;
+    SmallVector<bool> writeInBounds;
+    for (unsigned i = 0; i < sourceType.getRank(); ++i) {
+      if (!sourceType.isDynamicDim(i)) {
+        vecShape.push_back(sourceType.getDimSize(i));
+        // Source shape is statically known: Neither read nor write are
+        // out-of-bounds.
+        readInBounds.push_back(true);
+        writeInBounds.push_back(true);
+      } else if (!resultType.isDynamicDim(i)) {
+        // Source shape is not statically known, but result shape is.
+        // Vectorize with size of result shape. This may be larger than the
+        // source size.
+        vecShape.push_back(resultType.getDimSize(i));
+        // Read may be out-of-bounds because the result size could be larger
+        // than the source size.
+        readInBounds.push_back(false);
+        // Write will be in-bounds provided that the corresponding write idx
+        // is 0. To keep this logic simple, conservatively mark as
+        // out-of-bounds.
+        writeInBounds.push_back(false);
+      } else {
+        // Neither the source nor the result dim is static. Cannot vectorize
+        // the copy.
+        // TODO: Add support for masking.
+        return failure();
+      }
+    }
+    auto vecType = VectorType::get(vecShape, sourceType.getElementType());
+
+    // 3. Generate TransferReadOp.
+    SmallVector<Value> readIndices(
+        vecType.getRank(),
+        rewriter.create<arith::ConstantIndexOp>(sliceOp.getLoc(), 0));
+    auto read = rewriter.create<vector::TransferReadOp>(
+        sliceOp.getLoc(), vecType, sliceOp.getSource(), readIndices, padValue,
+        ArrayRef<bool>{readInBounds});
+
+    // 4. Generate TransferWriteOp.
+    auto writeIndices =
+        ofrToIndexValues(rewriter, sliceOp.getLoc(), sliceOp.getMixedOffsets());
+
+    // 5. Finalize.
+    rewriter.replaceOpWithNewOp<vector::TransferWriteOp>(
+        sliceOp, read, sliceOp.getDest(), writeIndices,
+        ArrayRef<bool>{writeInBounds});
+
+    return success();
+  }
+};
+
 /// Rewrite use of tensor::PadOp result in InsertSliceOp. E.g.:
 /// ```
 /// %0 = tensor.pad %src ... : tensor<?x?xf32> to tensor<17x5xf32>
@@ -2710,13 +2772,18 @@ struct PadOpVectorizationWithInsertSlicePattern
   }
 };
 
+void mlir::linalg::populateInsertSliceVectorizationPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<InsertSliceVectorizePattern>(patterns.getContext());
+}
+
 void mlir::linalg::populatePadOpVectorizationPatterns(
     RewritePatternSet &patterns, PatternBenefit baseBenefit) {
   // TODO: The following pattern implements "decomposition" and
   // optional "vectorization". Separate "decomposition" into a separate
   // pre-processing pattern group.
-  patterns.add<GenericPadOpVectorizationPattern>(patterns.getContext(),
-                                                 baseBenefit);
+  patterns.add<GeneralizePadOpPattern>(patterns.getContext(), baseBenefit);
+
   // Try these specialized patterns first before resorting to the generic one.
   patterns.add<PadOpVectorizationWithTransferReadPattern,
                PadOpVectorizationWithTransferWritePattern,
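For reference, a minimal sketch of how a downstream pass might pick up the new entry point alongside the existing pad-op patterns. Only populateInsertSliceVectorizationPatterns and populatePadOpVectorizationPatterns come from the diff above; the helper function, its name, and the greedy-driver plumbing are illustrative assumptions, not part of this change.

// Sketch only: hypothetical helper, assuming the standard greedy rewrite
// driver from mlir/Transforms/GreedyPatternRewriteDriver.h is acceptable here.
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Hypothetical: collect both pattern sets and apply them greedily to `op`.
static mlir::LogicalResult applyVectorizationPatterns(mlir::Operation *op) {
  mlir::RewritePatternSet patterns(op->getContext());
  // New in this change: rewrite tensor.insert_slice into a
  // vector.transfer_read + vector.transfer_write pair.
  mlir::linalg::populateInsertSliceVectorizationPatterns(patterns);
  // Pre-existing pad-op decomposition and folding patterns.
  mlir::linalg::populatePadOpVectorizationPatterns(patterns, /*baseBenefit=*/1);
  return mlir::applyPatternsAndFoldGreedily(op, std::move(patterns));
}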