Commit c95fcd3

[mlir][bufferization] Remove resolveUsesInRepetitiveRegions (llvm#67927)
The bufferization analysis has improved over the past few months, so this workaround is no longer needed.
1 parent 6a01da4 · commit c95fcd3
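
For context, the removed preprocessing targeted patterns like the following (a minimal hypothetical example, not taken from the patch): a tensor that initializes an iter_args and is also read inside the loop body. One-Shot Bufferize's conflict analysis now handles such uses directly instead of copying the tensor unconditionally before the loop.

func.func @use_inside_loop(%tensor: tensor<?xf32>, %lb: index, %ub: index,
                           %step: index) -> tensor<?xf32> {
  %r = scf.for %iv = %lb to %ub step %step
      iter_args(%t = %tensor) -> (tensor<?xf32>) {
    // %tensor is read inside the repetitive region while the potentially
    // aliasing iter_arg %t is written to; the removed helper resolved this
    // by always copying %tensor before the loop.
    %e = tensor.extract %tensor[%iv] : tensor<?xf32>
    %w = tensor.insert %e into %t[%iv] : tensor<?xf32>
    scf.yield %w : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}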

File tree: 3 files changed (+16 −91 lines)

  mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
  mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
  mlir/test/Dialect/SCF/one-shot-bufferize.mlir

mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp

Lines changed: 0 additions & 73 deletions

@@ -26,82 +26,9 @@ namespace bufferization {
 using namespace mlir;
 using namespace mlir::bufferization;
 
-/// Resolve all operands that are also used inside of repetitive regions of the
-/// same op. Such cases are not fully supported by One-Shot Bufferize.
-///
-/// E.g.:
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor)
-///   ...
-/// }
-///
-/// Is converted to:
-/// %tensor_copy = bufferization.alloc_tensor copy(%tensor)
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor_copy)
-///   ...
-/// }
-static void
-resolveUsesInRepetitiveRegions(Operation *op,
-                               const BufferizationOptions &options) {
-  IRRewriter rewriter(op->getContext());
-  AnalysisState state(options);
-
-  // Look for repetitive ops (loops).
-  op->walk([&](BufferizableOpInterface bufferizableOp) {
-    // Skip filtered ops.
-    if (!options.isOpAllowed(bufferizableOp.getOperation()))
-      return WalkResult::advance();
-
-    // Find all operands that are also used inside of a repetitive region of
-    // this op.
-    for (OpOperand &opOperand : bufferizableOp->getOpOperands()) {
-      Value operand = opOperand.get();
-      // Skip non-tensor operands.
-      if (!isa<TensorType>(operand.getType()))
-        continue;
-      // Skip operands that do not bufferize to memory writes.
-      if (!bufferizableOp.bufferizesToMemoryWrite(opOperand, state))
-        continue;
-
-      // Gather all uses inside repetitive regions.
-      SmallVector<OpOperand *> usesInsideRegion;
-      for (OpOperand &use : operand.getUses()) {
-        Operation *owner = use.getOwner();
-        if (!bufferizableOp->isProperAncestor(owner))
-          continue;
-        for (Region &r : bufferizableOp->getRegions()) {
-          if (r.findAncestorOpInRegion(*owner) &&
-              bufferizableOp.isRepetitiveRegion(r.getRegionNumber())) {
-            usesInsideRegion.push_back(&use);
-            break;
-          }
-        }
-      }
-      // Nothing to do if the operand is not used inside a repetitive region.
-      if (usesInsideRegion.empty())
-        continue;
-
-      // Insert a tensor copy and replace all uses inside of repetitive regions.
-      rewriter.setInsertionPoint(bufferizableOp);
-      auto tensorCopy = rewriter.create<AllocTensorOp>(
-          bufferizableOp->getLoc(), cast<TensorType>(operand.getType()),
-          /*dynamicSizes=*/ValueRange(),
-          /*copy=*/operand, /*memory_space=*/IntegerAttr());
-      for (OpOperand *use : usesInsideRegion)
-        use->set(tensorCopy);
-    }
-
-    return WalkResult::advance();
-  });
-}
-
 LogicalResult mlir::bufferization::insertTensorCopies(
     Operation *op, const OneShotBufferizationOptions &options,
     BufferizationStatistics *statistics) {
-  // Preprocessing: Resolve currently unsupported bufferization cases.
-  resolveUsesInRepetitiveRegions(op, options);
-
   OneShotAnalysisState state(op, options);
   // Run normal One-Shot Bufferize analysis or One-Shot Module Bufferize
   // analysis depending on whether function boundary bufferization is enabled or
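
For reference, the AllocTensorOp created by the removed helper corresponds to IR of this shape (a sketch based on the function's own doc comment; %tensor stands for the affected operand):

// Inserted right before the repetitive op; all uses of %tensor inside
// repetitive regions were then redirected to %tensor_copy.
%tensor_copy = bufferization.alloc_tensor() copy(%tensor) : tensor<?xf32>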

mlir/test/Dialect/Linalg/one-shot-bufferize.mlir

Lines changed: 6 additions & 9 deletions

@@ -160,8 +160,7 @@ func.func @matmul(
   %c16 = arith.constant 16 : index
 
   // Hoisted alloc.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<128x192xf32>
-  // CHECK: memref.copy %[[C]], %[[ALLOC]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>
 
   // CHECK: scf.for %[[I:.*]] =
   %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
@@ -173,14 +172,12 @@ func.func @matmul(
     %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
         tensor<256x192xf32> to tensor<256x16xf32>
 
-    // C was already replaced with a copy by preprocessing, so no copy is
-    // needed here.
-    // CHECK: %[[C_SLICE:.*]] = memref.subview %[[ALLOC]]
+    // Insert an artificial out-of-place buffer by extracting from %C instead
+    // of %arg6.
     %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<128x192xf32> to tensor<8x16xf32>
 
-    // linalg.fill is inplace.
-    // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]]
+    // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
     %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
 
     // CHECK: scf.for %[[K:.*]] =
@@ -191,7 +188,7 @@ func.func @matmul(
           tensor<256x16xf32> to tensor<32x16xf32>
 
       // linalg.matmul is inplace as well as the enclosing scf.for.
-      // CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]]
+      // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
       %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                           outs(%arg8 : tensor<8x16xf32>)
           -> tensor<8x16xf32>
@@ -202,7 +199,7 @@ func.func @matmul(
     // that is not in place. So we must insert a copy of the small buffer into
     // the bigger buffer.
     // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
-    // CHECK: memref.copy %[[C_SLICE]], %[[T]]
+    // CHECK: memref.copy %[[ALLOC]], %[[T]]
     %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<8x16xf32> into tensor<128x192xf32>
 
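
Reconstructed from the CHECK lines above (a sketch of the bufferized inner tile, not the full test output; value names are illustrative): the loop body now works in one hoisted 8x16 buffer and copies only the computed tile back into %C, rather than copying all of %C upfront.

func.func @tile_sketch(%A: memref<8x32xf32>, %B: memref<32x16xf32>,
                       %C: memref<128x192xf32>, %i: index, %j: index) {
  %cst = arith.constant 0.0 : f32
  // Hoisted tile-sized alloc (previously memref<128x192xf32> plus a full copy).
  %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x16xf32>
  linalg.fill ins(%cst : f32) outs(%alloc : memref<8x16xf32>)
  linalg.matmul ins(%A, %B : memref<8x32xf32>, memref<32x16xf32>)
                outs(%alloc : memref<8x16xf32>)
  // Only the 8x16 tile is copied back into the big buffer.
  %T = memref.subview %C[%i, %j] [8, 16] [1, 1]
      : memref<128x192xf32> to memref<8x16xf32, strided<[192, 1], offset: ?>>
  memref.copy %alloc, %T
      : memref<8x16xf32> to memref<8x16xf32, strided<[192, 1], offset: ?>>
  memref.dealloc %alloc : memref<8x16xf32>
  return
}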

mlir/test/Dialect/SCF/one-shot-bufferize.mlir

Lines changed: 10 additions & 9 deletions

@@ -253,11 +253,10 @@ func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32
 // CHECK-SAME: %[[t:.*]]: memref<?xf32
 // CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}})
 // CHECK: memref.copy %[[t]], %[[alloc]]
-// CHECK: %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[t]])
+// CHECK: %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[alloc]])
 // CHECK-DAG: %[[alloc2:.*]] = memref.alloc(%{{.*}})
-// CHECK: memref.copy %[[alloc]], %[[alloc2]]
-// CHECK: %[[alloc2_casted:.*]] = memref.cast %[[alloc2]]
-// CHECK: scf.yield %[[alloc2_casted]]
+// CHECK: memref.copy %[[t]], %[[alloc2]]
+// CHECK: scf.yield %[[alloc2]]
 // CHECK: return %[[for]]
 func.func @scf_for_yield_non_equivalent(
     %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
@@ -606,9 +605,9 @@ func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
 
   // CHECK: scf.forall (%{{.*}}) in (2) {
 
-  // Load from the copy and store into the shared output.
-  // CHECK: %[[subview:.*]] = memref.subview %[[t]]
-  // CHECK: memref.load %[[t_copy]]
+  // Load from the original and store into the copy.
+  // CHECK: %[[subview:.*]] = memref.subview %[[t_copy]]
+  // CHECK: memref.load %[[t]]
   // CHECK: memref.store %{{.*}}, %[[subview]]
   %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
     %offset = arith.muli %c5, %tid : index
@@ -752,14 +751,16 @@ func.func @scf_for_yield_alias_of_non_equivalent(%sz: index) -> tensor<?xf32> {
   // CHECK: scf.for
   %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
     %iv_sub = arith.subi %iv, %c1 : index
-    // CHECK: memref.subview %[[generate_copy]]
+    // CHECK: memref.subview %[[generate]]
     %ll = tensor.extract_slice %0[%iv_sub][%sz][1] : tensor<?xf32> to tensor<?xf32>
     %l = tensor.extract %ll[%c0] : tensor<?xf32>
     %double = arith.mulf %cst, %l : f32
-    // CHECK: memref.store %{{.*}}, %[[generate]]
+    // CHECK: memref.store %{{.*}}, %[[generate_copy]]
     %s = tensor.insert %double into %t[%iv] : tensor<?xf32>
     scf.yield %s : tensor<?xf32>
   }
+
+  // CHECK: return %[[generate_copy]]
   return %r : tensor<?xf32>
 }
 
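
The first hunk above concerns a loop that yields a value not equivalent to its iter_arg, which forces a fresh buffer and a copy per iteration. A function of roughly this shape triggers that case (a sketch consistent with the signature and CHECK lines above, not necessarily the verbatim test body):

func.func @scf_for_yield_non_equivalent(
    %t: tensor<?xf32>, %lb: index, %ub: index, %step: index) -> tensor<?xf32> {
  %r = scf.for %iv = %lb to %ub step %step iter_args(%a = %t) -> (tensor<?xf32>) {
    // Yielding %t rather than %a makes the yielded value non-equivalent
    // to the iter_arg, so bufferization materializes a copy per iteration.
    scf.yield %t : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}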
