[mlir][vector] Fix off-by-one error in getTransferChunkAccessed (llvm#70292)

matthias-springer · web-flow · commit e6e55e620bb6 · 2023-10-27T17:39:55.000+09:00
If a dimension does not appear in the permutation map of a vector
transfer op, the size of the accessed slice in that dimension is `1`.
Before this fix, `getTransferChunkAccessed` used to return `0` for such
dimensions, which would means that `0` elements in the underlying
tensor/memref are accessed.

Note: There is no test case that fails due to this bug and because this
interface method is currently only used in one place, it is hard to
write a regression test. This fix is in preparation of subset hoisting
functionality that will be added in subsequent commits.
diff --git a/mlir/include/mlir/Interfaces/VectorInterfaces.td b/mlir/include/mlir/Interfaces/VectorInterfaces.td
@@ -257,22 +257,22 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> {
     >,
     InterfaceMethod<
       /*desc=*/[{
-      Return an upper-bound shape accessed by the transfer op within the
-      tensor/memref operand.
+      Return the shape of the hyperrectangular slice within the tensor/memref
+      operand that is accessed by the transfer op.
       For example:
       ```
-        vector.transfer %w0[%i, %j] {
-          permutation_map = affine_map<(d0, d1) -> (d1, d0, 0)>} :
-          tensor<?x?xf32>, vector<4x2x6xf32>
+        vector.transfer %w0[%i, %j, %k] {
+          permutation_map = affine_map<(d0, d1, d2) -> (d1, d0, 0)>} :
+          tensor<?x?x?xf32>, vector<4x2x6xf32>
       ```
-      returns a shape [2, 4].
+      returns a shape [2, 4, 1].
       }],
       /*retTy=*/"SmallVector<int64_t>",
       /*methodName=*/"getTransferChunkAccessed",
       /*args=*/(ins),
       /*methodBody=*/"",
       /*defaultImplementation=*/[{
-        SmallVector<int64_t> dimSizes($_op.getPermutationMap().getNumDims(), 0);
+        SmallVector<int64_t> dimSizes($_op.getPermutationMap().getNumDims(), 1);
         for (auto vecDims : llvm::zip($_op.getPermutationMap().getResults(),
                                       $_op.getVectorType().getShape())) {
           AffineExpr dim = std::get<0>(vecDims);
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4004,35 +4004,36 @@ struct TransferReadAfterWriteToBroadcast
     auto defWrite = readOp.getSource().getDefiningOp<vector::TransferWriteOp>();
     if (!defWrite)
       return failure();
-
-    SmallVector<int64_t> readDims = readOp.getTransferChunkAccessed();
-    Value vec;
-    if (readOp.getIndices() == defWrite.getIndices() &&
-        readOp.getMask() == defWrite.getMask()) {
-      SmallVector<int64_t> writeDims = defWrite.getTransferChunkAccessed();
-      // TODO: If the writeDim is a superset of the read dims we could do an
-      // extract_strided_slice.
-      if (writeDims == readDims)
-        vec = defWrite.getVector();
-    }
+    // TODO: If the written transfer chunk is a superset of the read transfer
+    // chunk we could do an extract_strided_slice.
+    if (readOp.getTransferChunkAccessed() !=
+        defWrite.getTransferChunkAccessed())
+      return failure();
+    // TODO: Support cases where a dim is explicitly written but implicitly
+    // read (i.e., a unit dim that is rank reduced).
+    if (getUnusedDimsBitVector({readOp.getPermutationMap()}) !=
+        getUnusedDimsBitVector({defWrite.getPermutationMap()}))
+      return failure();
+    if (readOp.getIndices() != defWrite.getIndices() ||
+        readOp.getMask() != defWrite.getMask())
+      return failure();
+    Value vec = defWrite.getVector();
     // TODO: loop through the chain of transfer_write if we can prove that they
     // don't overlap with the transfer_read. This requires improving
     // `isDisjointTransferIndices` helper.
-    if (!vec)
-      return failure();
-    SmallVector<unsigned> permutation;
     AffineMap readMap = compressUnusedDims(readOp.getPermutationMap());
     AffineMap writeMap = compressUnusedDims(defWrite.getPermutationMap());
     AffineMap map = readMap.compose(writeMap);
     if (map.getNumResults() == 0)
       return failure();
-    // Calculate the permuation to apply to go from the vector stored to the
+    // Calculate the permutation to apply to go from the vector stored to the
     // vector read.
+    SmallVector<unsigned> permutation;
     if (!map.isPermutationOfMinorIdentityWithBroadcasting(permutation))
       return failure();
 
     Location loc = readOp.getLoc();
-    // Calculate the broadcast shape by applying the reverse permuation to the
+    // Calculate the broadcast shape by applying the reverse permutation to the
     // final shape we want.
     ArrayRef<int64_t> destShape = readOp.getVectorType().getShape();
     SmallVector<int64_t> broadcastShape(destShape.size());
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2400,3 +2400,27 @@ func.func @fold_shape_cast_with_constant_mask() -> vector<4xi1>{
   %2 = vector.shape_cast %1 : vector<4x1x1xi1> to vector<4xi1>
   return %2 : vector<4xi1>
 }
+
+// -----
+
+// TODO: This IR could be canonicalized but the canonicalization pattern is not
+// smart enough. For now, just make sure that we do not crash.
+
+// CHECK-LABEL: func.func @load_store_forwarding_rank_mismatch(
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @load_store_forwarding_rank_mismatch(%v0: vector<4x1x1xf32>, %arg0: tensor<4x4x4xf32>) -> (vector<1x100x4x5xf32>) {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  // d0 is explicitly written.
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0, %c0]
+      {in_bounds = [true, true, true],
+      permutation_map = affine_map<(d0, d1, d2) -> (d2, d1, d0)>} :
+      vector<4x1x1xf32>, tensor<4x4x4xf32>
+  // d0 is implicitly read (rank-reduction of unit dim).
+  %r = vector.transfer_read %w0[%c0, %c0, %c0], %cf0
+      {in_bounds = [true, true, true, true],
+      permutation_map = affine_map<(d0, d1, d2) -> (d1, 0, d2, 0)>} :
+      tensor<4x4x4xf32>, vector<1x100x4x5xf32>
+  return %r : vector<1x100x4x5xf32>
+}