[mlir][Vector] Add fold transpose(shape_cast) -> shape_cast

MacDue · MacDue · commit bd64a9b50fe4 · 2023-12-06T12:40:16.000Z
This folds transpose(shape_cast) into a new shape_cast, when the transpose just permutes a unit dim from the result of the shape_cast. Example: ``` %0 = vector.shape_cast %vec : vector<[4]xf32> to vector<[4]x1xf32> %1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> ``` Folds to: ``` %0 = vector.shape_cast %vec : vector<[4]xf32> to vector<1x[4]xf32> ``` This is an (alternate) fix for lowering matmuls to ArmSME. --- Corrected version of llvm#73951.
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5548,12 +5548,69 @@ class FoldTransposeCreateMask final : public OpRewritePattern<TransposeOp> {
   }
 };
 
+/// Folds transpose(shape_cast) into a new shape_cast, when the transpose just
+/// permutes a unit dim from the result of the shape_cast. Only swaps of
+/// adjacent dims where at least one is a fixed-size (non-scalable) unit dim
+/// are folded; any other non-identity permutation is a match failure.
+class FoldTransposeShapeCast final : public OpRewritePattern<TransposeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TransposeOp transpOp,
+                                PatternRewriter &rewriter) const override {
+    Value transposeSrc = transpOp.getVector();
+    auto shapeCastOp = transposeSrc.getDefiningOp<vector::ShapeCastOp>();
+    if (!shapeCastOp)
+      return rewriter.notifyMatchFailure(
+          transpOp, "TransposeOp source is not ShapeCastOp");
+
+    auto shapeCastSourceType = shapeCastOp.getSourceVectorType();
+    auto sourceType = transpOp.getSourceVectorType();
+    auto resultType = transpOp.getResultVectorType();
+    auto permutation = transpOp.getPermutation();
+
+    auto getSourceDim = [&](int64_t index) {
+      return std::make_pair(sourceType.getDimSize(index),
+                            sourceType.getScalableDims()[index]);
+    };
+
+    auto unitDim = std::make_pair(int64_t(1), false);
+    for (auto [i, resultIndex] : llvm::enumerate(permutation)) {
+      int64_t sourceIndex = int64_t(i);
+      if (sourceIndex == resultIndex)
+        continue;
+      // If the transpose crosses any non-unit dims this is also a non-unit
+      // transpose, so we restrict the index distance to 1, e.g. reject:
+      //   vector.transpose %0, [0, 3, 2, 1] : vector<2x1x2x5xi32> to
+      //                                       vector<2x5x2x1xi32>
+      // This could be relaxed to checking if all dims between the `sourceIndex`
+      // and `resultIndex` are unit dims (in both the source and result vector
+      // type).
+      if (std::abs(sourceIndex - resultIndex) != 1 ||
+          (getSourceDim(sourceIndex) != unitDim &&
+           getSourceDim(resultIndex) != unitDim)) {
+        return rewriter.notifyMatchFailure(
+            transpOp, "TransposeOp has non-unit permutation");
+      }
+    }
+
+    if (!isValidShapeCast(shapeCastSourceType.getShape(),
+                          resultType.getShape()))
+      return rewriter.notifyMatchFailure(
+          transpOp, "TransposeOp cannot fold into valid ShapeCastOp");
+
+    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(transpOp, resultType,
+                                                     shapeCastOp.getSource());
+    return success();
+  }
+};
+
 } // namespace
 
 void vector::TransposeOp::getCanonicalizationPatterns(
     RewritePatternSet &results, MLIRContext *context) {
   results.add<FoldTransposeCreateMask, FoldTransposedScalarBroadcast,
-              TransposeFolder, FoldTransposeSplat>(context);
+              TransposeFolder, FoldTransposeSplat, FoldTransposeShapeCast>(
+      context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -67,6 +67,70 @@ func.func @create_mask_transpose_to_transposed_create_mask(
 
 // -----
 
+// CHECK-LABEL: transposed_unit_dim_shape_cast_to_shape_cast
+//  CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32>
+func.func @transposed_unit_dim_shape_cast_to_shape_cast(%vec: vector<[4]xf32>) -> vector<1x[4]xf32> {
+  //     CHECK: vector.shape_cast %[[VEC]] : vector<[4]xf32> to vector<1x[4]xf32>
+  // CHECK-NOT: vector.transpose
+  %0 = vector.shape_cast %vec : vector<[4]xf32> to vector<[4]x1xf32>
+  // Dims 0 and 1 are swapped and dim 1 is a fixed-size unit dim, so this folds:
+  %1 = vector.transpose %0, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32>
+  return %1 : vector<1x[4]xf32>
+}
+
+// -----
+
+// CHECK-LABEL: transposed_multiple_unit_dim_shape_cast_to_shape_cast
+//  CHECK-SAME: %[[VEC:.*]]: vector<120xf32>
+func.func @transposed_multiple_unit_dim_shape_cast_to_shape_cast(%vec: vector<120xf32>) -> vector<2x1x3x4x1x5xf32> {
+  //     CHECK: vector.shape_cast %[[VEC]] : vector<120xf32> to vector<2x1x3x4x1x5xf32>
+  // CHECK-NOT: vector.transpose
+  %0 = vector.shape_cast %vec : vector<120xf32> to vector<1x2x3x4x5x1xf32>
+  // Swaps 0 <-> 1 and 4 <-> 5 each involve a unit dim (dims 0 and 5), so this folds:
+  %1 = vector.transpose %0, [1, 0, 2, 3, 5, 4] : vector<1x2x3x4x5x1xf32> to vector<2x1x3x4x1x5xf32>
+  return %1 : vector<2x1x3x4x1x5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: transposed_non_unit_dim_shape_cast_0
+//  CHECK-SAME: %[[VEC:.*]]: vector<120xf32>
+func.func @transposed_non_unit_dim_shape_cast_0(%vec: vector<120xf32>) -> vector<1x3x2x4x1x5xf32> {
+  //      CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<120xf32> to vector<1x2x3x4x5x1xf32>
+  // CHECK-NEXT: vector.transpose %[[SHAPE_CAST]], [0, 2, 1, 3, 5, 4] : vector<1x2x3x4x5x1xf32> to vector<1x3x2x4x1x5xf32>
+  %0 = vector.shape_cast %vec : vector<120xf32> to vector<1x2x3x4x5x1xf32>
+  // Swap 1 <-> 2 is between two non-unit dims (sizes 2 and 3), so no fold:
+  %1 = vector.transpose %0, [0, 2, 1, 3, 5, 4] : vector<1x2x3x4x5x1xf32> to vector<1x3x2x4x1x5xf32>
+  return %1 : vector<1x3x2x4x1x5xf32>
+}
+// -----
+
+// CHECK-LABEL: transposed_non_unit_dim_shape_cast_1
+//  CHECK-SAME: %[[VEC:.*]]: vector<1x256x256xf32>
+func.func @transposed_non_unit_dim_shape_cast_1(%vec: vector<1x256x256xf32>) -> vector<256x256xf32> {
+  //      CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<1x256x256xf32> to vector<256x256xf32>
+  // CHECK-NEXT: vector.transpose %[[SHAPE_CAST]], [1, 0] : vector<256x256xf32> to vector<256x256xf32>
+  %0 = vector.shape_cast %vec : vector<1x256x256xf32> to vector<256x256xf32>
+  // Swap 0 <-> 1 is between two non-unit dims (both 256), so no fold:
+  %1 = vector.transpose %0, [1, 0] : vector<256x256xf32> to vector<256x256xf32>
+  return %1 : vector<256x256xf32>
+}
+
+// -----
+
+// CHECK-LABEL: transposed_non_unit_dim_shape_cast_2
+//  CHECK-SAME: %[[VEC:.*]]: vector<20xf32>
+func.func @transposed_non_unit_dim_shape_cast_2(%vec: vector<20xf32>) -> vector<2x5x2x1xf32> {
+  //      CHECK: %[[SHAPE_CAST:.*]] = vector.shape_cast %[[VEC]] : vector<20xf32> to vector<2x1x2x5xf32>
+  // CHECK-NEXT: vector.transpose %[[SHAPE_CAST]], [0, 3, 2, 1] : vector<2x1x2x5xf32> to vector<2x5x2x1xf32>
+  %0 = vector.shape_cast %vec : vector<20xf32> to vector<2x1x2x5xf32>
+  // Swap 1 <-> 3 moves the unit dim across non-unit dim 2, so no fold:
+  %1 = vector.transpose %0, [0, 3, 2, 1] : vector<2x1x2x5xf32> to vector<2x5x2x1xf32>
+  return %1 : vector<2x5x2x1xf32>
+}
+
+// -----
+
 // CHECK-LABEL: extract_from_create_mask
 //  CHECK-SAME: %[[DIM0:.*]]: index, %[[DIM1:.*]]: index
 func.func @extract_from_create_mask(%dim0: index, %dim1: index) -> vector<[4]x[4]xi1> {