
Commit affd836

fixup! [mlir][tensor] Extend the logic to generalise tensor.pack
Address PR comments from Han-Chung: some clean-up, and relax the requirement that the dynamic dim has to be a constant multiple of vector.vscale.
1 parent 78247f5 commit affd836
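
For context, the kind of IR this fixup now accepts is a tensor.pack whose single dynamic inner tile is an arbitrary `index` value, no longer required to be a constant multiple of vector.vscale. A minimal sketch, mirroring the @simple_pad_and_pack_dynamic test added in this commit (the SSA names follow that test):

func.func @simple_pad_and_pack_dynamic(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32, %high: index) -> tensor<1x1x?x2xf32> {
  // The first inner tile (%high) is dynamic and unconstrained; the second is the static 2.
  %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%high, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32>
  return %0 : tensor<1x1x?x2xf32>
}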

File tree: 4 files changed (+62, -61 lines)

mlir/include/mlir/Dialect/Tensor/Utils/Utils.h

Lines changed: 4 additions & 4 deletions
@@ -14,10 +14,10 @@
 namespace mlir {
 namespace tensor {

-// Return a PadOp that pads `source` to `type` size where the static
-// sizes are assumed to be greater than the dynamic sizes. If `type` has dynamic
-// dimensions the padding width is set to zero. The op performs "high" padding
-// (i.e. it adds trailing padding values until the desired size is met).
+// Return a PadOp that pads `source` to `type` size. Output sizes (from `type`)
+// are assumed to be static and greater than the potentially dynamic input sizes
+// (from `source`). The op performs "high" padding (i.e. it adds trailing
+// padding values until the desired size is met).
 PadOp createPadHighOp(RankedTensorType type, Value source, Value pad,
                       bool nofold, Location loc, OpBuilder &builder,
                       std::optional<Value> dynOutDim = {});
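
For reference (not part of the diff), this is the kind of op createPadHighOp builds: a tensor.pad that keeps "low" padding at zero and adds all padding at the upper end of each dimension. A minimal sketch for the static case from the tests below, a 5x1 source padded up to an 8x2 tile shape; the SSA names %src and %pad_val are illustrative assumptions, not values from the patch:

// High padding of a 5x1 source up to 8x2; low padding stays [0, 0].
%padded = tensor.pad %src low[0, 0] high[3, 1] {
^bb0(%i: index, %j: index):
  tensor.yield %pad_val : f32
} : tensor<5x1xf32> to tensor<8x2xf32>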

mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp

Lines changed: 15 additions & 27 deletions
@@ -1028,9 +1028,8 @@ LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite(
 /// This method assumes that all outer dims for this pack Op are 1.
 ///
 /// At most _one_ inner tile size can be _dynamic_, all other inner tiles are
-/// required to have static sizes. The inner tile that's dynamic must be a
-/// multiple of vector.vscale (to support scalable tile sizes). This condition
-/// can be relaxed in the future.
+/// required to have static sizes. This restriction can be relaxed in the
+/// future.
 static Value getPackOpSourceOrPaddedSource(OpBuilder &builder,
                                            tensor::PackOp packOp) {
   Value input = packOp.getSource();
@@ -1049,8 +1048,8 @@ static Value getPackOpSourceOrPaddedSource(OpBuilder &builder,
   DenseMap<int64_t, OpFoldResult> tileAndPosMapping =
       packOp.getDimAndTileMapping();

-  // The size of a scalable tile (if present).
-  Value scalableSize;
+  // The size of a dynamic tile (if present).
+  Value dynamicTileSize;

   // Collect dims for the padded shape.
   SmallVector<int64_t> paddedShape;
@@ -1080,16 +1079,15 @@ static Value getPackOpSourceOrPaddedSource(OpBuilder &builder,
     // 2.2 Dynamic tile sizes
     paddedShape.push_back(ShapedType::kDynamic);

-    // Get the value that holds the scalable size.
-    assert(!scalableSize && "Only one scalable size is supported ATM.");
-    scalableSize = llvm::dyn_cast_if_present<Value>(tileSizeForDim);
-    assert(vector::getConstantVscaleMultiplier(scalableSize) &&
-           "This dynamic shape is not a multiple of vscale, this !");
+    // Get the value that holds the dynamic size.
+    assert(!dynamicTileSize && "Only one dynamic tile is supported ATM.");
+    dynamicTileSize = llvm::dyn_cast_if_present<Value>(tileSizeForDim);
   }
   auto resultType =
       RankedTensorType::get(paddedShape, inputType.getElementType());
   return tensor::createPadHighOp(resultType, input, packOp.getPaddingValue(),
-                                 /*nofold=*/false, loc, builder, scalableSize);
+                                 /*nofold=*/false, loc, builder,
+                                 dynamicTileSize);
 }

 // Normalizes a permutation on a higher rank space to its actual size, e.g.
@@ -1152,14 +1150,6 @@ getPackUnpackRankReducedPerm(ArrayRef<int64_t> shape,

 LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
     tensor::PackOp packOp, PatternRewriter &rewriter) const {
-  if (llvm::any_of(packOp.getMixedTiles(), [](OpFoldResult tile) {
-        return tile.is<Value>() && !vector::getConstantVscaleMultiplier(
-                                       llvm::dyn_cast<Value>(tile));
-      })) {
-    return rewriter.notifyMatchFailure(
-        packOp, "require inner tile sizes to be either static or a constant "
-                "multiple of vector.vscale");
-  }
   if (llvm::count_if(packOp.getMixedTiles(),
                      [](OpFoldResult tile) { return tile.is<Value>(); }) > 1) {
     return rewriter.notifyMatchFailure(
@@ -1221,22 +1211,20 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
   SmallVector<int64_t> transpShape = readShape;
   applyPermutationToVector<int64_t>(transpShape, perm);

-  // If there's a tile with a scalable size, retrieve its size. ATM only 1
-  // scalable tile is allowed.
-  Value scalableSize;
+  // If there's a tile with a dynamic size, retrieve its size. ATM only 1
+  // dynamic tile is allowed.
+  Value dynDimSize;
   for (auto tile : packOp.getMixedTiles()) {
     if (tile.is<Value>()) {
-      assert(!scalableSize && "Only one scalable size is supported ATM.");
-      scalableSize = cast<Value>(tile);
-      assert(vector::getConstantVscaleMultiplier(scalableSize) &&
-             "This dynamic shape is not a multiple of vscale!");
+      assert(!dynDimSize && "Only one scalable size is supported ATM.");
+      dynDimSize = cast<Value>(tile);
     }
   }

   Value empty =
       ShapedType::isDynamicShape(cast<ShapedType>(input.getType()).getShape())
           ? rewriter.create<tensor::EmptyOp>(loc, transpShape, elemType,
-                                             scalableSize)
+                                             dynDimSize)
           : rewriter.create<tensor::EmptyOp>(loc, transpShape, elemType);
   auto transposedOp =
       rewriter.create<linalg::TransposeOp>(loc, tile, empty, perm);

mlir/lib/Dialect/Tensor/Utils/Utils.cpp

Lines changed: 18 additions & 25 deletions
@@ -26,37 +26,30 @@ PadOp mlir::tensor::createPadHighOp(RankedTensorType type, Value source,
                                     Value pad, bool nofold, Location loc,
                                     OpBuilder &b,
                                     std::optional<Value> dynOutDim) {
-  assert(llvm::count_if(
-             type.getShape(),
-             [](int64_t dim) { return ShapedType::isDynamic(dim); }) <= 1 &&
+
+  assert(type.getNumDynamicDims() <= 1 &&
          "At most one output dim can be dynamic!");

   // Init "low" and "high" padding values ("low" is kept as is, "high" is
   // computed below).
   SmallVector<OpFoldResult> low(type.getRank(), b.getIndexAttr(0));
   SmallVector<OpFoldResult> high(type.getRank(), b.getIndexAttr(0));
-  for (const auto &en : enumerate(type.getShape())) {
-    if (!ShapedType::isDynamic(en.value())) {
-      // Static sizes - the "high" value is computed based on the input and
-      // output dims. Compute the padding width.
-      AffineExpr d0;
-      bindDims(b.getContext(), d0);
-      OpFoldResult sz = tensor::getMixedSize(b, loc, source, en.index());
-      high[en.index()] =
-          affine::makeComposedFoldedAffineApply(b, loc, en.value() - d0, {sz});
-    } else {
-      // Dynamic sizes - the "high" value is computed based on the input dim
-      // and `dynOutDim`.
-      assert(dynOutDim.has_value() &&
-             "dynamic output dim requires dynOutDim to be set");
-
-      // Compute the padding width.
-      AffineExpr d0, d1;
-      auto dimVal = b.create<tensor::DimOp>(loc, source, en.index());
-      bindDims(b.getContext(), d0, d1);
-      high[en.index()] = affine::makeComposedFoldedAffineApply(
-          b, loc, d0 - d1, {dynOutDim.value(), dimVal.getResult()});
-    }
+
+  for (const auto [idx, val] : enumerate(type.getShape())) {
+    bool isOutDimDynamic = ShapedType::isDynamic(val);
+    assert((!isOutDimDynamic || dynOutDim.has_value()) &&
+           "dynamic output dim requires dynOutDim to be set");
+
+    // Compute the padding width: outDim - srcDim.
+    AffineExpr d0, d1;
+    bindDims(b.getContext(), d0, d1);
+    OpFoldResult srcDim = tensor::getMixedSize(b, loc, source, idx);
+    Value outDim = isOutDimDynamic
+                       ? dynOutDim.value()
+                       : b.create<arith::ConstantIndexOp>(loc, val).getResult();
+
+    high[idx] = affine::makeComposedFoldedAffineApply(b, loc, d0 - d1,
+                                                      {outDim, srcDim});
   }
   return b.create<PadOp>(loc, type, source, low, high, pad, nofold);
 }
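
With the unified loop above, the padding width is always computed as outDim - srcDim through a single folded affine.apply. When the source dim is static (5 in the tests), tensor::getMixedSize returns it as an attribute and makeComposedFoldedAffineApply folds it into the map, which is presumably why the CHECK'd map in the tests below changes from (s0 - s1) to (s0 - 5). An illustrative sketch of the folded IR; the SSA name %out_dim is an assumption:

// Padding width for a dynamic output dim and a static source dim of 5.
#map = affine_map<()[s0] -> (s0 - 5)>
%pad_high = affine.apply #map()[%out_dim]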

mlir/test/Dialect/Linalg/generalize-tensor-pack.mlir

Lines changed: 25 additions & 5 deletions
@@ -23,7 +23,7 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x
   %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %output : tensor<5x1xf32> -> tensor<1x1x8x2xf32>
   return %0 : tensor<1x1x8x2xf32>
 }
-// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 - s1)>
+// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0] -> (s0 - 5)>

 // CHECK-LABEL: func.func @simple_pad_and_pack
 // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]
@@ -36,7 +36,29 @@ func.func @simple_pad_and_pack(%input: tensor<5x1xf32>, %output: tensor<1x1x8x2x
 // CHECK-SAME: [0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1]
 // CHECK: return %[[INSERT]]

-/// Same as example above, but with scalable sizes.
+/// Same as example above, but with dynamic tile size.
+
+func.func @simple_pad_and_pack_dynamic(%input: tensor<5x1xf32>, %output: tensor<1x1x?x2xf32>, %pad: f32, %high: index) -> tensor<1x1x?x2xf32> {
+  %0 = tensor.pack %input padding_value(%pad : f32) inner_dims_pos = [0, 1] inner_tiles = [%high, 2] into %output : tensor<5x1xf32> -> tensor<1x1x?x2xf32>
+  return %0 : tensor<1x1x?x2xf32>
+}
+
+// CHECK-LABEL: func.func @simple_pad_and_pack_dynamic(
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[PAD_VAL:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[HIGH_VAL:.*]]: index) -> tensor<1x1x?x2xf32> {
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[PAD_HIGH:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[HIGH_VAL]]]
+// CHECK: %[[PAD:.*]] = tensor.pad %[[SRC]] low[0, 0] high{{\[}}%[[PAD_HIGH]], 1] {
+// CHECK: tensor.yield %[[PAD_VAL]] : f32
+// CHECK-NOT: linalg.transpose
+// CHECK: %[[SLICE:.*]] = tensor.extract_slice %[[VAL_10:.*]][0, 0] {{\[}}%[[HIGH_VAL]], 2] [1, 1] : tensor<?x2xf32> to tensor<?x2xf32>
+// CHECK: %[[DIM:.*]] = tensor.dim %[[DEST]], %[[C2]] : tensor<1x1x?x2xf32>
+// CHECK: %[[RES:.*]] = tensor.insert_slice %[[SLICE]] into %[[DEST]][0, 0, 0, 0] [1, 1, %[[DIM]], 2] [1, 1, 1, 1] : tensor<?x2xf32> into tensor<1x1x?x2xf32>
+// CHECK: return %[[RES]] : tensor<1x1x?x2xf32>
+
+/// Same as example above, but with scalable tile size.

 /// NOTE: For this example to make sense in practice, the "?" in the output shape
 /// should effectively be 8 * vector.vscale (and that's what tensor.dim
@@ -50,17 +72,15 @@ func.func @simple_pad_and_pack_scalable(%input: tensor<5x1xf32>, %output: tensor
   return %0 : tensor<1x1x?x2xf32>
 }

-
 // CHECK-LABEL: func.func @simple_pad_and_pack_scalable(
 // CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]: tensor<5x1xf32>,
 // CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]]: tensor<1x1x?x2xf32>,
 // CHECK-SAME: %[[PAD_VAL:[a-zA-Z0-9]+]]: f32) -> tensor<1x1x?x2xf32> {
 // CHECK: %[[C2:.+]] = arith.constant 2 : index
-// CHECK: %[[C5:.+]] = arith.constant 5 : index
 // CHECK: %[[C8:.+]] = arith.constant 8 : index
 // CHECK: %[[VS:.+]] = vector.vscale
 // CHECK: %[[C8_VS:.+]] = arith.muli %[[VS]], %[[C8]] : index
-// CHECK: %[[PAD_HIGH:.+]] = affine.apply #[[$ATTR_0]](){{\[}}%[[C8_VS]], %[[C5]]]
+// CHECK: %[[PAD_HIGH:.*]] = affine.apply #[[$ATTR_0]](){{\[}}%[[C8_VS]]]
 // CHECK: %[[PAD:.+]] = tensor.pad %[[SRC]] low[0, 0] high{{\[}}%[[PAD_HIGH]], 1] {
 // CHECK: tensor.yield %[[PAD_VAL]] : f32
 // CHECK-NOT: linalg.transpose
