[mlir][bufferization] Fix bufferization of repetitive regions

matthias-springer · matthias-springer · commit c89c31a2306e · 2023-02-06T16:23:08.000+01:00
The previous strategy was too complex and faulty. Op dominance cannot be used to rule out RaW conflicts due to op ordering if the reading op and the conflicting writing op are in a sub repetitive region of the closest enclosing repetitive region of the definition of the read value. Differential Revision: https://reviews.llvm.org/D143087
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -575,6 +575,11 @@ Region *getEnclosingRepetitiveRegion(Value value,
 Region *getEnclosingRepetitiveRegion(Block *block,
                                      const BufferizationOptions &options);
 
+/// Assuming that the given region is repetitive, find the next enclosing
+/// repetitive region.
+Region *getNextEnclosingRepetitiveRegion(Region *region,
+                                         const BufferizationOptions &options);
+
 namespace detail {
 /// This is the default implementation of
 /// BufferizableOpInterface::getAliasingOpOperands. Should not be called from
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -41,6 +41,15 @@ MLIR_DEFINE_EXPLICIT_TYPE_ID(mlir::bufferization::AnalysisState)
 using namespace mlir;
 using namespace bufferization;
 
+static bool isRepetitiveRegion(Region *region,
+                               const BufferizationOptions &options) {
+  Operation *op = region->getParentOp();
+  if (auto bufferizableOp = options.dynCastBufferizableOp(op))
+    if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber()))
+      return true;
+  return false;
+}
+
 Region *bufferization::getEnclosingRepetitiveRegion(
     Operation *op, const BufferizationOptions &options) {
   if (!op->getBlock())
@@ -52,11 +61,9 @@ Region *bufferization::getEnclosingRepetitiveRegion(
     Value value, const BufferizationOptions &options) {
   Region *region = value.getParentRegion();
   while (region) {
-    Operation *op = region->getParentOp();
-    if (auto bufferizableOp = options.dynCastBufferizableOp(op))
-      if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber()))
-        return region;
-    region = op->getParentRegion();
+    if (isRepetitiveRegion(region, options))
+      return region;
+    region = region->getParentRegion();
   }
   return nullptr;
 }
@@ -67,13 +74,22 @@ Region *bufferization::getEnclosingRepetitiveRegion(
   Operation *op = nullptr;
   do {
     op = region->getParentOp();
-    if (auto bufferizableOp = options.dynCastBufferizableOp(op))
-      if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber()))
-        return region;
+    if (isRepetitiveRegion(region, options))
+      return region;
   } while ((region = op->getParentRegion()));
   return nullptr;
 }
 
+Region *bufferization::getNextEnclosingRepetitiveRegion(
+    Region *region, const BufferizationOptions &options) {
+  assert(isRepetitiveRegion(region, options) && "expected repetitive region");
+  while ((region = region->getParentRegion())) {
+    if (isRepetitiveRegion(region, options))
+      break;
+  }
+  return region;
+}
+
 Operation *bufferization::getOwnerOfValue(Value value) {
   if (auto opResult = value.dyn_cast<OpResult>())
     return opResult.getDefiningOp();
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -346,25 +346,27 @@ static bool happensBefore(Operation *a, Operation *b,
   return false;
 }
 
-/// Return `true` if op dominance can be used to rule out read-after-write
-/// conflicts wrt. the given reads and writes.
+/// Return `true` if op dominance can be used to rule out a read-after-write
+/// conflicts based on the ordering of ops.
 ///
-/// Op dominance can often be used to rule out potential conflicts such as
-/// "read" happens before "write". E.g., the following IR is not a RaW conflict
-/// because the the read happens *before* the write.
+/// Generalized op dominance can often be used to rule out potential conflicts
+/// due to "read happens before write". E.g., the following IR is not a RaW
+/// conflict because the read happens *before* the write.
 ///
-/// %0 = ... : tensor<?xf32>
-/// "reading_op"(%0) : tensor<?xf32>
-/// %1 = "writing_op"(%0) : tensor<?xf32> -> tensor<?xf32>
+/// Example 1:
+/// %0 = ... : tensor<?xf32>                                // DEF
+/// "reading_op"(%0) : tensor<?xf32>                        // READ
+/// %1 = "writing_op"(%0) : tensor<?xf32> -> tensor<?xf32>  // WRITE
 ///
 /// This is no longer true inside loops (or repetitive regions). In such cases,
 /// there may not be a meaningful `happensBefore` relationship because ops
 /// could be executed multiple times. E.g.:
 ///
-/// %0 = ... : tensor<?xf32>
+/// Example 2:
+/// %0 = ... : tensor<?xf32>                                  // DEF
 /// scf.for ... {
-///   "reading_op"(%0) : tensor<?xf32>
-///   %1 = "writing_op"(%0) : tensor<?xf32> -> tensor<?xf32>
+///   "reading_op"(%0) : tensor<?xf32>                        // READ
+///   %1 = "writing_op"(%0) : tensor<?xf32> -> tensor<?xf32>  // WRITE
 ///   ...
 /// }
 ///
@@ -374,92 +376,78 @@ static bool happensBefore(Operation *a, Operation *b,
 /// execution of writing_op. This is problematic because the tensor %0 they
 /// operate on (i.e., the "definition") is defined outside of the loop.
 ///
-/// Counter example:
+/// On a high-level, there is a potential RaW in a program if there exists a
+/// possible program execution such that there is a sequence of DEF, followed
+/// by WRITE, followed by READ. Each additional DEF resets the sequence.
 ///
+/// E.g.:
+/// No conflict:        DEF, WRITE, DEF, READ
+/// Potential conflict: DEF, READ, WRITE, READ, WRITE
+///
+/// Example 1 has no conflict:          DEF, READ, WRITE
+/// Example 2 has a potential conflict: DEF, (READ, WRITE)*
+//
+/// Example 3:
 /// scf.for ... {
 ///   %0 = ... : tensor<?xf32>
 ///   "reading_op"(%0) : tensor<?xf32>
 ///   %1 = "writing_op"(%0) : tensor<?xf32> -> tensor<?xf32>
 ///   ...
 /// }
+/// This has no conflict: (DEF, READ, WRITE)*
 ///
-/// In this example, the definition %0 is in the same repetitive region as
-/// "writing_op", so op dominance can be used to compute the `happensBefore`
-/// relationship.
-///
-/// Whether op dominance can be used or not is decided as follows: Find the
-/// closest enclosing repetitive region of all buffer writes wrt. the given
-/// tensor reads and writes. (The given sets of reads and writes contain the
-/// entire alias set.) In case of a read, we look at the op that defines the
-/// read value. In case of a write, we look at the op that is writing. If all of
-/// those ops are in the same closest enclosing repetitive region (nullptr in
-/// case of "no repetitive region" found at all), then op dominance can be used.
-/// Otherwise, it cannot be used.
-///
-/// Example: The common enclosing repetitive region is the scf.for loop.
-///          Op dominance can be used.
+/// Example 4:
+/// %0 = ... : tensor<?xf32>
 /// scf.for ... {
-///   %0 = tensor.generate
-///   "read"(%0)
+///   scf.for ... { "reading_op"(%0) }
+///   %1 = "writing_op"(%0)
 /// }
+/// This has a potential conflict: DEF, ((READ)*, WRITE)*
 ///
-/// Example: The common enclosing repetitive region is nullptr: There is no
-///          repetitive region around the tensor.generate. Op dominance can be
-///          used.
-/// %0 = tensor.generate
-/// scf.for ... { "read"(%0) }
+/// Example 5:
+/// %0 = ... : tensor<?xf32>
+/// scf.for ... { %1 = "writing_op"(%0) }
+/// scf.for ... { "reading_op"(%0) }
+/// This has a potential conflict: DEF, WRITE*, READ*
 ///
-/// Example: The common enclosing repetitive regions of tensor.generate and
-///          "write" differ. Op dominance cannot be used.
-/// %0 = tensor.generate
-/// scf.for ... {
-///   "read"(%0)
-///   "write"(%0)
-/// }
+/// The following rules are used to rule out RaW conflicts via ordering of ops:
 ///
-/// Example: The common enclosing repetitive regions of tensor.generate and
-///          "write" differ, but there is no read of %0, so op dominance can be
-///          used.
-/// %0 = tensor.generate
-/// scf.for ... {
-///   "write"(%0)
-/// }
+/// 1. If the closest enclosing repetitive region of DEF is a proper ancestor of
+///    a repetitive region that enclosing both READ and WRITE, we cannot rule
+///    out RaW conflict due to the ordering of ops.
+/// 2. Otherwise: There are no loops that interfere with our analysis; for
+///    analysis purposes, we can assume that there are no loops/repetitive
+///    regions. I.e., we can rule out a RaW conflict if READ happensBefore WRITE
+///    or WRITE happensBefore DEF. (Checked in `hasReadAfterWriteInterference`.)
 ///
-/// Note: iter_args of loops are not aliases of their respective block
-/// arguments, so op domanice can be used when analyzing ops that operate
-/// on them.
-bool canUseOpDominance(const DenseSet<OpOperand *> &usesRead,
-                       const DenseSet<OpOperand *> &usesWrite,
+bool canUseOpDominance(OpOperand *uRead, OpOperand *uWrite,
+                       const SetVector<Value> &definitions,
                        const AnalysisState &state) {
   const BufferizationOptions &options = state.getOptions();
-  std::optional<Region *> commonEnclosingRegion;
+  for (Value def : definitions) {
+    Region *rRead = getEnclosingRepetitiveRegion(uRead->getOwner(), options);
+    Region *rDef = getEnclosingRepetitiveRegion(def, options);
 
-  // In case of a write, take the region in which the write takes place.
-  for (OpOperand *uWrite : usesWrite) {
-    Region *r = getEnclosingRepetitiveRegion(uWrite->getOwner(), options);
-    if (!commonEnclosingRegion.has_value()) {
-      commonEnclosingRegion = r;
+    // READ and DEF are in the same repetitive region. `happensBefore` can be
+    // used to rule out RaW conflicts due to op ordering.
+    if (rRead == rDef)
       continue;
-    }
-    if (*commonEnclosingRegion != r)
-      return false;
-  }
 
-  // In case of a read, take the region which the read value is defined.
-  for (OpOperand *uRead : usesRead) {
-    // Optimization: Skip reads of values that have no defined contents.
-    if (!state.bufferizesToMemoryWrite(uRead->get()))
-      continue;
-    Region *r = getEnclosingRepetitiveRegion(uRead->get(), options);
-    if (!commonEnclosingRegion.has_value()) {
-      commonEnclosingRegion = r;
-      continue;
+    // Find the enclosing repetitive region of READ that is closest to DEF but
+    // not the repetitive region of DEF itself.
+    while (true) {
+      Region *nextRegion = getNextEnclosingRepetitiveRegion(rRead, options);
+      if (nextRegion == rDef)
+        break;
+      assert(nextRegion && "expected to find another repetitive region");
+      rRead = nextRegion;
     }
-    if (*commonEnclosingRegion != r)
+
+    // We cannot use op dominance if WRITE is inside the same repetitive region.
+    if (rRead->getParentOp()->isAncestor(uWrite->getOwner()))
       return false;
   }
-
-  return commonEnclosingRegion.has_value();
+  return true;
 }
 
 /// Annotate IR with details about the detected RaW conflict.
@@ -507,10 +495,6 @@ static bool hasReadAfterWriteInterference(
     AnalysisState &state, const BufferizationAliasInfo &aliasInfo) {
   const BufferizationOptions &options = state.getOptions();
 
-  // Check if op dominance can be used to rule out read-after-write conflicts.
-  bool useDominance = canUseOpDominance(usesRead, usesWrite, state);
-  LLVM_DEBUG(llvm::dbgs() << "\n- useDominance = " << useDominance << "\n");
-
   for (OpOperand *uRead : usesRead) {
     Operation *readingOp = uRead->getOwner();
     LLVM_DEBUG(llvm::dbgs() << "\n- check conflict:\n");
@@ -542,6 +526,12 @@ static bool hasReadAfterWriteInterference(
                               << uConflictingWrite->getOperandNumber() << " of "
                               << *uConflictingWrite->getOwner() << "\n");
 
+      // Check if op dominance can be used to rule out read-after-write
+      // conflicts.
+      bool useDominance =
+          canUseOpDominance(uRead, uConflictingWrite, definitions, state);
+      LLVM_DEBUG(llvm::dbgs() << "\n- useDominance = " << useDominance << "\n");
+
       // Throughout this loop, check for multiple requirements that have to be
       // met for uConflictingWrite to be an actual conflict.
       Operation *conflictingWritingOp = uConflictingWrite->getOwner();
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -697,3 +697,104 @@ func.func @no_raw_conflict_after_repetitive_use(%arg0: tensor<4xf32>,
 
   return %2, %7 : tensor<4xf32>, tensor<4xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @read_of_bbarg_in_repetitive_region(
+func.func @read_of_bbarg_in_repetitive_region(
+    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
+  // CHECK: scf.for
+  scf.for %iv = %a to %b step %c {
+    // Must bufferize out-of-place because definition of read is in a different
+    // repetitive region.
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["false"]}
+    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
+    %3 = tensor.extract %2[%a] : tensor<4xf32>
+    vector.print %3 : f32
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %4 = tensor.insert %cst into %2[%a] : tensor<4xf32>
+    %5 = tensor.extract %4[%a] : tensor<4xf32>
+    vector.print %5 : f32
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_write(
+func.func @read_definition_in_same_repetitive_region_as_write(
+    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
+  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
+  // CHECK: scf.for
+  scf.for %iv = %a to %b step %c {
+    // Can bufferize in-place.
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
+    %2 = tensor.extract_slice %1[0][4][1] : tensor<10xf32> to tensor<4xf32>
+    %3 = tensor.extract %2[%a] : tensor<4xf32>
+    vector.print %3 : f32
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_conflicting_write(
+func.func @read_definition_in_same_repetitive_region_as_conflicting_write(
+    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
+  // Cannot bufferize in-place according to normal op dominance rules.
+  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
+  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
+  // CHECK: scf.for
+  scf.for %iv = %a to %b step %c {
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
+    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
+    %3 = tensor.extract %2[%a] : tensor<4xf32>
+    vector.print %3 : f32
+  }
+  return
+}
+
+// -----
+
+// CHECK: func @write_value_in_repetitive_region(
+func.func @write_value_in_repetitive_region(
+    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
+  %0 = tensor.extract %t[%a] : tensor<10xf32>
+  vector.print %0 : f32
+
+  scf.for %iv = %a to %b step %c {
+    // No further read of %0, so this can bufferize in-place.
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
+    %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
+    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+    %filled = linalg.fill ins(%cst : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32>
+    %3 = tensor.extract %filled[%a] : tensor<4xf32>
+    vector.print %3 : f32
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func @nesting_op_repetitive_regions(
+func.func @nesting_op_repetitive_regions(
+    %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) {
+  // Cannot bufferize in-place according to normal op dominance rules.
+  // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
+  %1 = tensor.insert %cst into %t[%a] : tensor<10xf32>
+  // CHECK: scf.for
+  scf.for %iv1 = %a to %b step %c {
+    // CHECK: scf.for
+    scf.for %iv2 = %a to %b step %c {
+      // CHECK: scf.for
+      scf.for %iv3 = %a to %b step %c {
+        // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]}
+        %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32>
+        %3 = tensor.extract %2[%a] : tensor<4xf32>
+        vector.print %3 : f32
+      }
+    }
+  }
+  return
+}