Commit d5a0dec

Fixed all the issues pointed out by HanHan and Diego.
1 parent 744a291 commit d5a0dec

File tree

4 files changed: +108 -42 lines changed

mlir/include/mlir/Dialect/Tensor/Utils/Utils.h

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ computeTransposedType(RankedTensorType rankedTensorType,
 SmallVector<int64_t> getPackUnPackInverseDestPerm(
     std::variant<tensor::PackOp, tensor::UnPackOp> packOp);

+/// Unpack requires some packing metadata, so create another
+/// function where this value is passed by reference.
+SmallVector<int64_t> getPackUnPackInverseDestPerm(
+    std::variant<tensor::PackOp, tensor::UnPackOp> packOp,
+    PackingMetadata &PackingMetadata);
 /// A tensor.insert_slice is a cast-like operation if it merely rank-extends the
 /// source tensor or inserts the source tensor into a destination tensor with
 /// the same shape.
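
Why the by-reference overload: callers such as the unpack vectorizer (see Vectorization.cpp below) need both the inverse permutation and the PackingMetadata that produced it. A minimal caller sketch follows, assuming the two includes are sufficient; the wrapper name getUnpackDestPermWithMetadata is hypothetical, while the two MLIR calls are the ones this commit adds and uses:

    // Hedged sketch of the intended call pattern, not code from this commit.
    #include "mlir/Dialect/Tensor/Utils/Utils.h"   // getPackUnPackInverseDestPerm
    #include "mlir/Dialect/Utils/IndexingUtils.h"  // invertPermutationVector

    static llvm::SmallVector<int64_t>
    getUnpackDestPermWithMetadata(mlir::tensor::UnPackOp unpackOp,
                                  mlir::PackingMetadata &metadata) {
      // The overload fills `metadata` as a side effect, so the caller no
      // longer recomputes computePackingMetadata() separately.
      return mlir::invertPermutationVector(
          mlir::tensor::getPackUnPackInverseDestPerm(unpackOp, metadata));
    }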

mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp

Lines changed: 15 additions & 15 deletions
@@ -1571,11 +1571,12 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp,
   return success();
 }

-/// Vectorize an `tensor::UnPackOp` without OuterDimsPerms to these 4 Ops:
-///   Vector::TransferReadOp - Reads the Vector Array of Source data
-///   vector::TransposeOp - Transpose the Source
-///   ShapeCastOp - Reshapes the data based on the target.
-///   vector::TransferWriteOp. - Write the result vector back.
+/// Vectorize a `tensor::UnPackOp` without OuterDimsPerms to these 4 Ops:
+///   Vector::TransferReadOp - Reads a vector from the source tensor
+///   vector::TransposeOp - Transpose the Source tensor
+///   ShapeCastOp - Reshape the data based on the target.
+///   vector::TransferWriteOp. - Write the result vector back to the destination
+/// tensor
 static LogicalResult vectorizeAsUnpackOp(RewriterBase &rewriter,
                                          tensor::UnPackOp unpackOp,
                                          ArrayRef<int64_t> inputVectorSizes,
@@ -1610,26 +1611,21 @@ static LogicalResult vectorizeAsUnpackOp(RewriterBase &rewriter,
     LDBG("Unable to reify result shapes of " << unpackOp);
     return failure();
   }
-  int64_t unpackRank = unpackTensorType.getRank();
   Location loc = unpackOp->getLoc();

+  // Read result, mask if necessary.
   Value readResult = createReadOrMaskedRead(
       rewriter, loc, unpackOp.getSource(),
       llvm::ArrayRef<int64_t>(readMaskShape.begin(), readMaskShape.end()),
       nullptr);

-  int64_t numPackedDim = unpackOp.getInnerDimsPos().size();
-  llvm::SmallVector<int64_t> lastDims = llvm::to_vector(
-      llvm::seq<int64_t>(unpackRank - numPackedDim, unpackRank));
-  PackingMetadata packMetadata =
-      computePackingMetadata(unpackRank, unpackOp.getInnerDimsPos());
-  SmallVector<int64_t> lastDimToInsertPosPerm = computePermutationVector(
-      unpackRank, lastDims, packMetadata.insertPositions);
+  PackingMetadata packMetadata;
+  SmallVector<int64_t> lastDimToInsertPosPerm = invertPermutationVector(
+      tensor::getPackUnPackInverseDestPerm(unpackOp, packMetadata));
   ShapedType maskedOpShapedType = cast<ShapedType>(readResult.getType());
   SmallVector<int64_t> stripMineShape(maskedOpShapedType.getShape());
   mlir::Type stripMineElemType = maskedOpShapedType.getElementType();
   applyPermutationToVector(stripMineShape, lastDimToInsertPosPerm);
-
   RankedTensorType stripMineTensorType =
       RankedTensorType::Builder(stripMineShape, stripMineElemType, {})
           .setShape(stripMineShape);
@@ -1646,8 +1642,12 @@ static LogicalResult vectorizeAsUnpackOp(RewriterBase &rewriter,
   vector::ShapeCastOp shapeCastOp = rewriter.create<vector::ShapeCastOp>(
       loc, vecCollapsedType, transposeOp->getResult(0));

+  // writeMaskShape must match the shape-cast shape for dynamic sizes,
+  // otherwise the verifier complains that the mask size is invalid.
   SmallVector<int64_t> writeMaskShape(
-      shapeCastOp.getResultVectorType().getShape());
+      unpackOp.getDestType().hasStaticShape()
+          ? inputVectorSizes
+          : shapeCastOp.getResultVectorType().getShape());
   Operation *write =
       createWriteOrMaskedWrite(rewriter, loc, shapeCastOp.getResult(),
                                reifiedRetShapes[0], writeMaskShape);
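
The net effect of this change is that the transpose permutation now comes from inverting the shared helper's result. As a self-contained illustration of the permutation helpers' semantics, here is a plain-C++ sketch; the assumption (believed correct, but not taken from this commit) is that MLIR's applyPermutationToVector computes out[i] = v[perm[i]] and invertPermutationVector computes inv[perm[i]] = i. The numbers come from the @test_vectorize_unpack test below.

    // Standalone sketch of the permutation semantics; compiles as C++11+.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Assumed applyPermutationToVector semantics: out[i] = v[perm[i]].
    static std::vector<int64_t> applyPerm(const std::vector<int64_t> &v,
                                          const std::vector<int64_t> &perm) {
      std::vector<int64_t> out(perm.size());
      for (std::size_t i = 0; i < perm.size(); ++i)
        out[i] = v[perm[i]];
      return out;
    }

    // Assumed invertPermutationVector semantics: inv[perm[i]] = i.
    static std::vector<int64_t> invertPerm(const std::vector<int64_t> &perm) {
      std::vector<int64_t> inv(perm.size());
      for (std::size_t i = 0; i < perm.size(); ++i)
        inv[perm[i]] = static_cast<int64_t>(i);
      return inv;
    }

    int main() {
      // @test_vectorize_unpack: the read vector <16x8x32x16> is transposed
      // with [0, 2, 1, 3], giving <16x32x8x16> before the shape_cast.
      std::vector<int64_t> shape = {16, 8, 32, 16};
      std::vector<int64_t> perm = {0, 2, 1, 3};
      assert(applyPerm(shape, perm) == (std::vector<int64_t>{16, 32, 8, 16}));
      // This particular permutation happens to be its own inverse.
      assert(invertPerm(perm) == perm);
      return 0;
    }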

mlir/lib/Dialect/Tensor/Utils/Utils.cpp

Lines changed: 27 additions & 21 deletions
@@ -75,18 +75,26 @@ mlir::tensor::computeTransposedType(RankedTensorType rankedTensorType,

 SmallVector<int64_t> mlir::tensor::getPackUnPackInverseDestPerm(
     std::variant<tensor::PackOp, tensor::UnPackOp> op) {
+  PackingMetadata pMetaData;
+  return getPackUnPackInverseDestPerm(op, pMetaData);
+}
+
+SmallVector<int64_t> mlir::tensor::getPackUnPackInverseDestPerm(
+    std::variant<tensor::PackOp, tensor::UnPackOp> op,
+    PackingMetadata &packingMetadata) {

   llvm::ArrayRef<int64_t> innerDimsPos, outerPerm;
-  RankedTensorType destType;
-  if (std::holds_alternative<tensor::PackOp>(op)) {
+  int64_t rank = 0;
+  bool isPackOp = std::holds_alternative<tensor::PackOp>(op);
+  if (isPackOp) {
     tensor::PackOp packOp = std::get<tensor::PackOp>(op);
     innerDimsPos = packOp.getInnerDimsPos();
-    destType = packOp.getDestType();
+    rank = packOp.getDestType().getRank();
     outerPerm = packOp.getOuterDimsPerm();
   } else {
     tensor::UnPackOp unpackOp = std::get<tensor::UnPackOp>(op);
     innerDimsPos = unpackOp.getInnerDimsPos();
-    destType = unpackOp.getDestType();
+    rank = unpackOp.getSourceType().getRank();
     outerPerm = unpackOp.getOuterDimsPerm();
   }
   // The permutation can be obtained from two permutations:
@@ -96,23 +104,21 @@ SmallVector<int64_t> mlir::tensor::getPackUnPackInverseDestPerm(
   //    has outer_dims_perm.
   // Apply (b) permutation on (a) permutation to get the final permutation.
   int64_t numPackedDims = innerDimsPos.size();
-  int64_t packedRank = destType.getRank();
-  auto lastDims = llvm::to_vector(
-      llvm::seq<int64_t>(packedRank - numPackedDims, packedRank));
-  PackingMetadata packingMetadata =
-      computePackingMetadata(destType.getRank(), innerDimsPos);
-  SmallVector<int64_t> innerPositionsPerm = computePermutationVector(
-      packedRank, lastDims, packingMetadata.insertPositions);
-
-  SmallVector<int64_t> outerPos = packingMetadata.outerPositions;
-  if (!outerPerm.empty())
-    applyPermutationToVector(outerPos, outerPerm);
-  SmallVector<int64_t> outerPositionPerm = computePermutationVector(
-      packedRank, packingMetadata.outerPositions, outerPos);
-
-  SmallVector<int64_t> packInverseDestPermutation = innerPositionsPerm;
-  applyPermutationToVector(packInverseDestPermutation, outerPositionPerm);
-  return packInverseDestPermutation;
+  auto lastDims =
+      llvm::to_vector(llvm::seq<int64_t>(rank - numPackedDims, rank));
+  packingMetadata = computePackingMetadata(rank, innerDimsPos);
+  SmallVector<int64_t> innerPositionsPerm =
+      computePermutationVector(rank, lastDims, packingMetadata.insertPositions);
+
+  if (isPackOp) {
+    SmallVector<int64_t> outerPos = packingMetadata.outerPositions;
+    if (!outerPerm.empty())
+      applyPermutationToVector(outerPos, outerPerm);
+    SmallVector<int64_t> outerPositionPerm = computePermutationVector(
+        rank, packingMetadata.outerPositions, outerPos);
+    applyPermutationToVector(innerPositionsPerm, outerPositionPerm);
+  }
+  return innerPositionsPerm;
 }

 bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) {
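
To see how innerPositionsPerm comes out in the common case, a worked sketch follows. Here makePerm is a hypothetical stand-in for computePermutationVector's placement behavior as understood by this editor (place positions[i] at index desired[i], fill remaining slots in increasing order), and the insertPositions value [1, 3] is inferred from the tests below, not computed by MLIR in this block.

    // Worked example: rank-4 unpack with inner_dims_pos = [0, 1].
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical model of computePermutationVector: put positions[i] at
    // index desired[i], then fill the remaining slots with the unused dims
    // in increasing order.
    static std::vector<int64_t> makePerm(int64_t rank,
                                         const std::vector<int64_t> &positions,
                                         const std::vector<int64_t> &desired) {
      std::vector<int64_t> perm(static_cast<std::size_t>(rank), -1);
      std::vector<bool> used(static_cast<std::size_t>(rank), false);
      for (std::size_t i = 0; i < positions.size(); ++i) {
        perm[desired[i]] = positions[i];
        used[positions[i]] = true;
      }
      int64_t next = 0;
      for (int64_t &slot : perm) {
        if (slot != -1)
          continue;
        while (used[next])
          ++next;
        slot = next++;
      }
      return perm;
    }

    int main() {
      // The two tile dims are the trailing dims [2, 3]; the packing metadata
      // wants them inserted at positions [1, 3], next to their outer dims.
      std::vector<int64_t> perm = makePerm(4, {2, 3}, {1, 3});
      // Matches the transpose permutation in the vectorization tests.
      assert(perm == (std::vector<int64_t>{0, 2, 1, 3}));
      return 0;
    }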

mlir/test/Dialect/Linalg/vectorization.mlir

Lines changed: 61 additions & 6 deletions
@@ -691,12 +691,12 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor<?x?xf32>, %arg1: t
 // CHECK: %[[CNST2:.*]] = arith.constant 2 : index
 // CHECK: %[[readMsk0:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x16x2xi1>
 // CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor<?x?x16x2xf32>, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32>
-// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to vector<2x2x1x16xf32>
-// CHEdCK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<4x2x1x16xf32> to vector<4x16xf32>
-// CHEdCK: %[[empt0:.*]] = tensor.empty
-// CHEdCK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1>
-// CHEdCK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]]
-// CHEdCK: return %[[write0]]
+// CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 2, 3, 1] : vector<2x1x16x2xf32> to vector<2x16x2x1xf32>
+// CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x16x2x1xf32> to vector<32x2xf32>
+// CHECK: %[[empt0:.*]] = tensor.empty
+// CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<32x2xi1>
+// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]]
+// CHECK: return %[[write0]]
 %ret = tensor.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor<?x?x16x2xf32> -> tensor<?x?xf32>
 return %ret : tensor<?x?xf32>
 }
@@ -707,3 +707,58 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_unpack
+func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[C8:.*]] = arith.constant 8 : index
+  // CHECK: %[[C80:.*]] = arith.constant 8 : index
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[C16:.*]] = arith.constant 16 : index
+  // CHECK: %[[MSK0:.*]] = vector.create_mask %c8, %c8_0, %c32, %c16 : vector<16x8x32x16xi1>
+  // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32>
+  // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+  // CHECK: %[[C01:.*]] = arith.constant 0 : index
+  // CHECK: %[[C256:.*]] = arith.constant 256 : index
+  // CHECK: %[[C128:.*]] = arith.constant 128 : index
+  // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1>
+  // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32>
+  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+  %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+  return %0 : tensor<256x128xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [512, 128] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @test_vectorize_unpack_no_masks
+func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> {
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32>
+  // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32>
+  // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32>
+  // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32>
+  // CHECK: %[[C00:.*]] = arith.constant 0 : index
+  // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32>
+  // CHECK: return %[[WRIT]] : tensor<256x128xf32>
+  %0 = tensor.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32>
+  return %0 : tensor<256x128xf32>
+}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.unpack"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [256, 128] : !transform.any_op
+    transform.yield
+  }
+}
