Commit 479c8d6

Use full slices when tiling by the full loop trip count (to support non-monotonic expressions) (#468)
When tiling a chain of linalg ops, only the first op's tile sizes can be set to 0 to mark dimensions as untiled; its producers instead receive a tile size equal to the loop trip count. We must return the full slice in those cases because the code that computes slice sizes in the general case does not handle non-monotonic affine expressions. Otherwise we would generate invalid code for non-monotonic expressions even when all involved dimensions are effectively untiled.
1 parent 54b4bfb commit 479c8d6
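As an illustration of the problem the commit message describes (this is not code from the patch), consider the map (d0) -> (d0 mod 3). The slice computation derives the accessed extent from the map's value at the bounds of the iteration domain, which is only valid for monotonically increasing expressions. The standalone C++ sketch below, using an arbitrarily chosen domain of [0, 8), shows how that endpoint-based reasoning underestimates the accessed range for a mod expression:

// Illustrative only: why a non-monotonic affine expression breaks
// endpoint-based slice computation. For (d0) -> (d0 mod 3) over [0, 8),
// the maximum value (2) is not attained at the upper bound, so deriving
// the accessed extent from expr(lb) and expr(ub - 1) is wrong.
#include <algorithm>
#include <cstdio>

int main() {
  auto expr = [](int d0) { return d0 % 3; }; // stand-in for (d0) -> (d0 mod 3)
  const int lb = 0, ub = 8;

  int trueMax = 0;
  for (int d0 = lb; d0 < ub; ++d0)
    trueMax = std::max(trueMax, expr(d0));

  // Endpoint-based reasoning, valid only for monotonically increasing maps.
  int endpointMax = std::max(expr(lb), expr(ub - 1));

  std::printf("true max accessed index:    %d\n", trueMax);     // 2
  std::printf("endpoint-derived max index: %d\n", endpointMax); // 1
  return 0;
}

Returning the full slice whenever a dimension is effectively untiled (tile size 0 or equal to the trip count) sidesteps this computation entirely.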

8 files changed: 46 additions and 77 deletions

mlir/include/mlir/IR/AffineExpr.h

Lines changed: 0 additions & 5 deletions
@@ -110,11 +110,6 @@ class AffineExpr {
   /// floordiv, ceildiv, and mod is only allowed w.r.t constants.
   bool isPureAffine() const;
 
-  /// Returns true if this expression is monotonicically increasing with respect
-  /// to the AffineDimExprs, i.e. increasing the value of any AffineDimExpr will
-  /// never decrease the value of the result.
-  bool isMonotonicallyIncreasing() const;
-
   /// Returns the greatest known integral divisor of this affine expression. The
   /// result is always positive.
   int64_t getLargestKnownDivisor() const;

mlir/include/mlir/IR/AffineMap.h

Lines changed: 0 additions & 4 deletions
@@ -382,10 +382,6 @@ class AffineMap {
   /// Returns true if the AffineMap represents a symbol-less permutation map.
   bool isPermutation() const;
 
-  // Returns true if every result is monotonically increasing.
-  // See AffineExpr::isMonotonicallyIncreasing().
-  bool isComponentWiseMonotonicallyIncreasing() const;
-
   /// Returns the map consisting of the `resultPos` subset.
   AffineMap getSubMap(ArrayRef<unsigned> resultPos) const;

mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp

Lines changed: 6 additions & 3 deletions
@@ -115,13 +115,16 @@ struct LinalgOpTilingInterface
   getTiledImplementation(Operation *op, OpBuilder &b,
                          ArrayRef<OpFoldResult> offsets,
                          ArrayRef<OpFoldResult> sizes) const {
-    // Leave the `sizeBounds` value empty. That is only needed when the `sizes`
-    // specified could lead to out of bounds accesses.
     Location loc = op->getLoc();
     LinalgOp linalgOp = cast<LinalgOp>(op);
+    SmallVector<OpFoldResult> allShapeSizes =
+        linalgOp.createFlatListOfOperandDims(b, linalgOp.getLoc());
+    SmallVector<OpFoldResult> sizeBounds =
+        mlir::affine::makeComposedFoldedMultiResultAffineApply(
+            b, loc, linalgOp.getShapesToLoopsMap(), allShapeSizes);
     SmallVector<Value> valuesToTile = linalgOp->getOperands();
     SmallVector<Value> tiledOperands = makeTiledShapes(
-        b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);
+        b, loc, linalgOp, valuesToTile, offsets, sizes, sizeBounds, true);
     SmallVector<Operation *> generatedSlices = llvm::map_to_vector(
         llvm::make_filter_range(
             tiledOperands,

mlir/lib/Dialect/Linalg/Utils/Utils.cpp

Lines changed: 25 additions & 10 deletions
@@ -56,10 +56,23 @@ namespace {
 // `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0]
 //
 struct TileCheck : public AffineExprVisitor<TileCheck> {
-  TileCheck(ArrayRef<OpFoldResult> tileSizes) : tileSizes(tileSizes) {}
+  TileCheck(ArrayRef<OpFoldResult> tileSizes, ArrayRef<OpFoldResult> sizeBounds)
+      : tileSizes(tileSizes), sizeBounds(sizeBounds) {}
 
   void visitDimExpr(AffineDimExpr expr) {
-    isTiled |= !isZeroIndex(tileSizes[expr.getPosition()]);
+    unsigned pos = expr.getPosition();
+
+    // This dimension is tiled if the tile size is larger than zero and not
+    // equal to its domain size (if statically known).
+    std::optional<int64_t> tileSize = getConstantIntValue(tileSizes[pos]);
+    if (tileSize && !sizeBounds.empty()) {
+      std::optional<int64_t> sizeBound = getConstantIntValue(sizeBounds[pos]);
+      if (sizeBound && *sizeBound == *tileSize) {
+        return;
+      }
+    }
+
+    isTiled |= !isZeroIndex(tileSizes[pos]);
   }
   void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) {
     visit(expr.getLHS());
@@ -70,24 +83,27 @@ struct TileCheck : public AffineExprVisitor<TileCheck> {
   }
   bool isTiled = false;
   ArrayRef<OpFoldResult> tileSizes;
+  ArrayRef<OpFoldResult> sizeBounds;
 };
 
 } // namespace
 
-static bool isTiled(AffineExpr expr, ArrayRef<OpFoldResult> tileSizes) {
+static bool isTiled(AffineExpr expr, ArrayRef<OpFoldResult> tileSizes,
+                    ArrayRef<OpFoldResult> sizeBounds) {
   if (!expr)
     return false;
-  TileCheck t(tileSizes);
+  TileCheck t(tileSizes, sizeBounds);
   t.visit(expr);
   return t.isTiled;
 }
 
 // Checks whether the `map varies with respect to a non-zero `tileSize`.
-static bool isTiled(AffineMap map, ArrayRef<OpFoldResult> tileSizes) {
+static bool isTiled(AffineMap map, ArrayRef<OpFoldResult> tileSizes,
+                    ArrayRef<OpFoldResult> sizeBounds) {
   if (!map)
     return false;
   for (unsigned r = 0; r < map.getNumResults(); ++r)
-    if (isTiled(map.getResult(r), tileSizes))
+    if (isTiled(map.getResult(r), tileSizes, sizeBounds))
       return true;
   return false;
 }
@@ -585,7 +601,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
     // The offset & size computation below only handles the case when
     // the map is monotonically increasing, i.e. the min and max values are
    // attained at the lower and upper bounds of the iteration domain.
-    if (!isTiled(m, tileSizes) || !m.isComponentWiseMonotonicallyIncreasing()) {
+    if (!isTiled(m, tileSizes, ubs)) {
      sliceParams.offsets.push_back(builder.getIndexAttr(0));
      OpFoldResult dim = createFoldedDimOp(builder, loc, valueToTile, r);
      sliceParams.sizes.push_back(dim);
@@ -784,10 +800,9 @@ computeAllSliceParameters(OpBuilder &builder, Location loc, LinalgOp linalgOp,
    // transformations such as padding and bufferization since the
    // extract/insert slice pairs make the accessed iteration argument
    // subdomains explicit.
-
    Type operandType = opOperand.get().getType();
-    if (!isTiled(map, tileSizes) && !(isa<RankedTensorType>(operandType) &&
-                                      linalgOp.isDpsInit(&opOperand))) {
+    if (!isTiled(map, tileSizes, {}) && !(isa<RankedTensorType>(operandType) &&
+                                          linalgOp.isDpsInit(&opOperand))) {
      allSliceParams.push_back(std::nullopt);
      LLVM_DEBUG(llvm::dbgs()
                 << ": not tiled: use shape: " << operandType << "\n");

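Distilling the Utils.cpp change above: a dimension is now considered untiled not only when its tile size is 0, but also when the statically known tile size equals the statically known loop bound, in which case the producer's full slice is used. A minimal standalone sketch of that decision (hypothetical helper name, not the upstream TileCheck code):

// Sketch of the rule added to TileCheck: a dimension counts as untiled when
// its tile size is 0, or when the static tile size equals the static trip count.
#include <cassert>
#include <cstdint>
#include <optional>

bool dimIsTiled(std::optional<int64_t> tileSize,
                std::optional<int64_t> loopTripCount) {
  if (!tileSize)
    return true; // dynamic tile size: conservatively assume tiled
  if (*tileSize == 0)
    return false; // explicit "untiled" marker
  if (loopTripCount && *loopTripCount == *tileSize)
    return false; // tiling by the full trip count leaves the dim untiled
  return true;
}

int main() {
  assert(dimIsTiled(2, 7));            // genuinely tiled
  assert(!dimIsTiled(0, 7));           // tile size 0 -> untiled
  assert(!dimIsTiled(7, 7));           // full trip count -> untiled (new rule)
  assert(dimIsTiled(std::nullopt, 7)); // dynamic tile size -> assume tiled
  return 0;
}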
mlir/lib/IR/AffineExpr.cpp

Lines changed: 0 additions & 36 deletions
@@ -239,42 +239,6 @@ bool AffineExpr::isPureAffine() const {
   llvm_unreachable("Unknown AffineExpr");
 }
 
-static bool isNonNegativeConstant(AffineExpr expr) {
-  auto constant = dyn_cast<AffineConstantExpr>(expr);
-  return constant && constant.getValue() >= 0;
-}
-
-bool AffineExpr::isMonotonicallyIncreasing() const {
-  switch (getKind()) {
-  case AffineExprKind::SymbolId:
-  case AffineExprKind::DimId:
-  case AffineExprKind::Constant:
-    return true;
-  case AffineExprKind::Add: {
-    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
-    return op.getLHS().isMonotonicallyIncreasing() &&
-           op.getRHS().isMonotonicallyIncreasing();
-  }
-  case AffineExprKind::Mul: {
-    // One operand must be a non-negative constant.
-    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
-    return op.getLHS().isMonotonicallyIncreasing() &&
-           op.getRHS().isMonotonicallyIncreasing() &&
-           (isNonNegativeConstant(op.getLHS()) ||
-            isNonNegativeConstant(op.getRHS()));
-  }
-  case AffineExprKind::FloorDiv:
-  case AffineExprKind::CeilDiv: {
-    auto op = llvm::cast<AffineBinaryOpExpr>(*this);
-    return op.getLHS().isMonotonicallyIncreasing() &&
-           isNonNegativeConstant(op.getRHS());
-  }
-  case AffineExprKind::Mod:
-    return false;
-  }
-  llvm_unreachable("Unknown AffineExpr");
-}
-
 // Returns the greatest known integral divisor of this affine expression.
 int64_t AffineExpr::getLargestKnownDivisor() const {
   AffineBinaryOpExpr binExpr(nullptr);

mlir/lib/IR/AffineMap.cpp

Lines changed: 0 additions & 5 deletions
@@ -651,11 +651,6 @@ bool AffineMap::isPermutation() const {
   return isProjectedPermutation();
 }
 
-bool AffineMap::isComponentWiseMonotonicallyIncreasing() const {
-  return all_of(getResults(),
-                [](auto expr) { return expr.isMonotonicallyIncreasing(); });
-}
-
 AffineMap AffineMap::getSubMap(ArrayRef<unsigned> resultPos) const {
   SmallVector<AffineExpr, 4> exprs;
   exprs.reserve(resultPos.size());

mlir/test/Dialect/Linalg/tile-tensors.mlir

Lines changed: 13 additions & 13 deletions
@@ -171,31 +171,31 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // CHECK-LABEL: func @non_monotonic_affine_expr
-//  CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?xf32>
-func.func @non_monotonic_affine_expr(%arg0 : tensor<?xf32>) -> tensor<?xf32> {
+//  CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<7xf32>
+func.func @non_monotonic_affine_expr(%arg0 : tensor<7xf32>) -> tensor<7xf32> {
   %c0 = arith.constant 0 : index
-  %0 = tensor.dim %arg0, %c0 : tensor<?xf32>
-  %empty = tensor.empty(%0) : tensor<?xf32>
+  %0 = tensor.dim %arg0, %c0 : tensor<7xf32>
+  %empty = tensor.empty() : tensor<7xf32>
 
-  // CHECK: scf.for
-  // CHECK:   %[[SIZE:[a-zA-Z0-9_]+]] = tensor.dim %[[ARG0]],
-  // CHECK:   tensor.extract_slice %[[ARG0]][0] [%[[SIZE]]] [1] : tensor<?xf32> to tensor<?xf32>
+  // CHECK: %[[OUT:.*]] = tensor.empty() : tensor<7xf32>
+  // CHECK: scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[OUT]]) -> (tensor<7xf32>) {
+  // CHECK:   tensor.extract_slice %[[TC0]][0] [7] [1] : tensor<7xf32> to tensor<7xf32>
   %generic = linalg.generic
-    {indexing_maps = [affine_map<(d0) -> (d0 mod 3)>,
+    {indexing_maps = [affine_map<(d0) -> (d0 mod 4)>,
                       affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]}
-    ins(%arg0: tensor<?xf32>)
-    outs(%empty : tensor<?xf32>) {
+    ins(%arg0: tensor<7xf32>)
+    outs(%empty : tensor<7xf32>) {
       ^bb0(%in : f32, %out: f32):
         linalg.yield %in : f32
-  } -> tensor<?xf32>
-  return %generic : tensor<?xf32>
+  } -> tensor<7xf32>
+  return %generic : tensor<7xf32>
 }
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop = transform.structured.tile_using_for %0 tile_sizes [100] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    %1, %loop = transform.structured.tile_using_for %0 tile_sizes [7] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
     transform.yield
   }
 }

mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir

Lines changed: 2 additions & 1 deletion
@@ -555,11 +555,12 @@ module {
 
         // CHECK: %[[T1:.*]] = linalg.generic {{.*}}
         // CHECK: %[[T2:.*]] = linalg.generic {{.*}}
+        // CHECK: %[[T3:.*]] = linalg.generic {{.*}}
         %7 = tensor.extract_slice %1[%4] [%5] [1] : tensor<?xf32> to tensor<?xf32>
 
         %8 = linalg.elemwise_unary ins(%7 : tensor<?xf32>) outs(%6 : tensor<?xf32>) -> tensor<?xf32>
         scf.forall.in_parallel {
-          // CHECK: tensor.parallel_insert_slice %[[T2]] into %[[ARG7]][%[[I0]]] [%[[I1]]] [1] : tensor<?xf32> into tensor<?xf32>
+          // CHECK: tensor.parallel_insert_slice %[[T3]] into %[[ARG7]][%[[I0]]] [%[[I1]]] [1] : tensor<?xf32> into tensor<?xf32>
          tensor.parallel_insert_slice %8 into %o[%2] [%5] [1] : tensor<?xf32> into tensor<?xf32>
        }
      }
