
Commit d8ee28b

[mlir][Linalg] Extend buffer allocation to support Linalg init tensors
This revision adds init_tensors support to buffer allocation for Linalg on tensors. It currently assumes that the init_tensors fold onto the first output tensors. This assumption is not yet enforced or cast in stone and requires experimenting with tiling of Linalg on tensors for ops **without reductions**. Still, this allows progress towards the end-to-end goal.
1 parent 8fa45e1 commit d8ee28b
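
To make the intended folding concrete, here is a minimal before/after sketch of the single-use case, adapted from the @generic_with_init_tensor test added in this commit (types and trait are taken from that test; the printed form of the bufferized op is indicative, not normative).

On tensors, the init tensor is tied to the op's result:

  %0 = linalg.generic #trait
     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
    init(%arg1 : tensor<3x2xf32>) {
    ^bb(%v0: vector<3x4xi4>, %v1: f32):
      %f0 = constant 0.0 : f32
      linalg.yield %f0 : f32
  } -> tensor<3x2xf32>
  return %0 : tensor<3x2xf32>

After buffer allocation, %arg1 has a single use, so its buffer is reused directly as the output buffer (no alloc, no copy for the op itself); the function result is then materialized by copying into the result memref appended to the signature (%result0 below):

  linalg.generic #trait
     ins(%arg0 : memref<2x3x4xvector<3x4xi4>>)
    outs(%arg1 : memref<3x2xf32>) {
    ^bb(%v0: vector<3x4xi4>, %v1: f32):
      %f0 = constant 0.0 : f32
      linalg.yield %f0 : f32
  }
  linalg.copy(%arg1, %result0) : memref<3x2xf32>, memref<3x2xf32>
  return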

File tree

4 files changed: +210 -29 lines changed

  mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
  mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp
  mlir/test/Transforms/buffer-placement-preparation.mlir
  mlir/test/lib/Transforms/TestBufferPlacement.cpp


mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp

Lines changed: 1 addition & 3 deletions
@@ -374,7 +374,6 @@ LogicalResult BlockArgsVerifier<IndexedGenericOp>::verify(IndexedGenericOp op,
 
 template <typename GenericOpType>
 static LogicalResult verifyGenericOp(GenericOpType op) {
-  auto nInputViews = op.getNumInputs();
   auto nLoops = op.getNumLoops();
 
   if (op.inputs().size() + op.output_buffers().size() +
@@ -410,8 +409,7 @@ static LogicalResult verifyGenericOp(GenericOpType op) {
     auto idx = en.index();
     auto m = en.value().template cast<AffineMapAttr>().getValue();
     indexingMaps.push_back(m); // Save reference to map for further checks.
-    auto view = (idx < nInputViews) ? op.getInputShapedType(idx)
-                                    : op.getOutputShapedType(idx - nInputViews);
+    auto view = op.getShapedType(idx);
 
     if (m.getNumSymbols() != expectedNumSymbols)
      return op.emitOpError("expected the number of symbols in indexing_map #")

mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp

Lines changed: 35 additions & 13 deletions
@@ -39,32 +39,50 @@ class GenericOpConverter
     linalg::GenericOpAdaptor adaptor(operands,
                                      op.getOperation()->getAttrDictionary());
 
-    // TODO: support ops with reduction.
-    if (!op.init_tensors().empty())
-      return failure();
-
     // All inputs need to be turned into buffers first. Until then, bail out.
     if (llvm::any_of(adaptor.inputs(),
                      [](Value in) { return !in.getType().isa<MemRefType>(); }))
       return failure();
 
+    // All init_tensors need to be turned into buffers first. Until then, bail
+    // out.
+    if (llvm::any_of(adaptor.init_tensors(),
+                     [](Value in) { return !in.getType().isa<MemRefType>(); }))
+      return failure();
+
     Location loc = op.getLoc();
-    SmallVector<Value, 2> outputBuffers, newOutputBuffers;
-    outputBuffers.assign(adaptor.output_buffers().begin(),
-                         adaptor.output_buffers().end());
+    SmallVector<Value, 2> newOutputBuffers;
     newOutputBuffers.reserve(op.getNumOutputs());
     newOutputBuffers.append(adaptor.output_buffers().begin(),
                             adaptor.output_buffers().end());
 
     // Update all types to memref types.
-    for (Type t : op.getResultTypes()) {
-      auto type = t.cast<ShapedType>();
+    // Assume the init tensors fold onto the first results.
+    // TODO: update this assumption because the reality is more complex under
+    // linalg on tensor based transformations.
+    for (auto en : llvm::enumerate(op.getResultTypes())) {
+      auto type = en.value().cast<ShapedType>();
       if (!type.hasStaticShape())
        return rewriter.notifyMatchFailure(
            op, "dynamic shapes not currently supported");
      auto memrefType = MemRefType::get(type.getShape(), type.getElementType());
-      auto alloc = rewriter.create<AllocOp>(loc, memrefType);
-      newOutputBuffers.push_back(alloc);
+      bool foldedInitTensor = en.index() < op.getNumInitTensors();
+      if (foldedInitTensor) {
+        // Dealing with an init tensor requires distinguishing between 1-use
+        // and many-use cases which would create aliasing and WAR hazards.
+        Value initTensor = op.getInitTensor(en.index());
+        Value initBuffer = adaptor.init_tensors()[en.index()];
+        if (initTensor.hasOneUse()) {
+          newOutputBuffers.push_back(initBuffer);
+          continue;
+        }
+        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+        rewriter.create<linalg::CopyOp>(loc, initBuffer, alloc);
+        newOutputBuffers.push_back(alloc);
+      } else {
+        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+        newOutputBuffers.push_back(alloc);
+      }
     }
 
     // Generate a new linalg operation that works on buffers.
@@ -82,8 +100,12 @@ class GenericOpConverter
     Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(),
                                            oldBlock.getArgumentTypes());
 
-    // Add the result arguments to the new block.
-    for (Value v : newOutputBuffers)
+    // Add the result arguments that do not come from init_tensors to the new
+    // block.
+    // TODO: update this assumption because the reality is more complex under
+    // linalg on tensor based transformations.
+    for (Value v :
+         ValueRange(newOutputBuffers).drop_front(adaptor.init_tensors().size()))
      newBlock->addArgument(v.getType().cast<MemRefType>().getElementType());
 
    // Clone the body of the old block to the new block.
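
When the init tensor has more than one use, reusing its buffer in place would clobber data another user still needs (the aliasing/WAR hazard mentioned in the comment above), so the pattern allocates a fresh buffer and copies the init contents into it before the op writes. A minimal sketch of what is emitted per such result, mirroring the shape of output checked by the @init_tensor_with_2_uses test below (SSA names are illustrative):

  %0 = alloc() : memref<3x2xf32>
  linalg.copy(%init_buffer, %0) : memref<3x2xf32>, memref<3x2xf32>
  linalg.generic #trait
     ins(%input : memref<2x3x4xvector<3x4xi4>>)
    outs(%0 : memref<3x2xf32>) {
    ^bb(%v0: vector<3x4xi4>, %v1: f32):
      %f0 = constant 0.0 : f32
      linalg.yield %f0 : f32
  }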

mlir/test/Transforms/buffer-placement-preparation.mlir

Lines changed: 138 additions & 0 deletions
@@ -382,3 +382,141 @@ func @decompose_tuple_typed_function_args_and_results(%arg0: tuple<i1,f32>, %arg
 // CHECK-NEXT: linalg.copy(%[[SECOND_TUPLE_SECOND_ELEM]], %[[RESULT0]])
 // CHECK-NEXT: linalg.copy(%[[ARG2]], %[[RESULT1]])
 // CHECK-NEXT: return %[[SECOND_TUPLE_FIRST_ELEM]], %[[FIRST_TUPLE_FIRST_ELEM]], %[[FIRST_TUPLE_SECOND_ELEM]]
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @generic_with_init_tensor(
+    %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+    init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %0 : tensor<3x2xf32>
+}
+// CHECK-LABEL: func @generic_with_init_tensor
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: linalg.generic
+// CHECK:      linalg.copy(%[[ARG1]], %[[RESULT0]])
+// CHECK-NEXT: return
+// CHECK-NOT:  %
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @init_tensor_with_2_uses(
+    %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>, tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+    init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  %1 = linalg.generic #trait
+     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+    init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %0, %1 : tensor<3x2xf32>, tensor<3x2xf32>
+}
+// CHECK-LABEL: func @init_tensor_with_2_uses
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>, %[[RESULT1:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: %[[ALLOC0:.*]] = alloc
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC0]])
+// CHECK-NEXT: linalg.generic
+// CHECK-SAME: outs(%[[ALLOC0]]
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: %[[ALLOC1:.*]] = alloc
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[ALLOC1]])
+// CHECK-NEXT: linalg.generic
+// CHECK-SAME: outs(%[[ALLOC1]]
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.copy(%[[ALLOC0]], %[[RESULT0]])
+// CHECK-NEXT: linalg.copy(%[[ALLOC1]], %[[RESULT1]])
+// CHECK-NEXT: return
+// CHECK-NOT:  %
+
+// -----
+
+#accesses = [
+  affine_map<(i, j, k) -> (j, i, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+func @init_tensor_with_1_use_def_chain(
+    %arg0: tensor<2x3x4xvector<3x4xi4>>, %arg1: tensor<3x2xf32>) -> (tensor<3x2xf32>) {
+
+  %0 = linalg.generic #trait
+     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+    init(%arg1 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  %1 = linalg.generic #trait
+     ins(%arg0 : tensor<2x3x4xvector<3x4xi4>>)
+    init(%0 : tensor<3x2xf32>) {
+    ^bb(%v0: vector<3x4xi4>, %v1: f32) :
+      %f0 = constant 0.0 : f32
+      linalg.yield %f0 : f32
+  } -> tensor<3x2xf32>
+
+  return %1 : tensor<3x2xf32>
+}
+// CHECK-LABEL: func @init_tensor_with_1_use_def_chain
+// CHECK-SAME: (%[[ARG0:.*]]: memref<2x3x4xvector<3x4xi4>>, %[[ARG1:.*]]: memref<3x2xf32>, %[[RESULT0:.*]]: memref<3x2xf32>) {
+// CHECK-NEXT: linalg.generic
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.generic
+// CHECK-NEXT: ^bb
+// CHECK-NEXT: constant
+// CHECK-NEXT: yield
+// CHECK-NEXT: }
+// CHECK-NEXT: linalg.copy(%[[ARG1]], %[[RESULT0]])
+// CHECK-NEXT: return
+// CHECK-NOT:  %
+

mlir/test/lib/Transforms/TestBufferPlacement.cpp

Lines changed: 36 additions & 13 deletions
@@ -56,34 +56,53 @@ struct TestBufferPlacementPreparationPass
       linalg::GenericOpAdaptor adaptor(operands,
                                        op.getOperation()->getAttrDictionary());
 
-      // TODO: support ops with reduction.
-      if (!op.init_tensors().empty())
-        return failure();
-
       // All inputs need to be turned into buffers first. Until then, bail out.
       if (llvm::any_of(adaptor.inputs(), [](Value in) {
             return !in.getType().isa<MemRefType>();
          }))
        return failure();
 
+      // All init_tensors need to be turned into buffers first. Until then, bail
+      // out.
+      if (llvm::any_of(adaptor.init_tensors(), [](Value in) {
+            return !in.getType().isa<MemRefType>();
+          }))
+        return failure();
+
       Location loc = op.getLoc();
-      SmallVector<Value, 2> outputBuffers, newOutputBuffers;
-      outputBuffers.assign(adaptor.output_buffers().begin(),
-                           adaptor.output_buffers().end());
+      SmallVector<Value, 2> newOutputBuffers;
       newOutputBuffers.reserve(op.getNumOutputs());
       newOutputBuffers.append(adaptor.output_buffers().begin(),
                               adaptor.output_buffers().end());
 
       // Update all types to memref types.
-      for (Type t : op.getResultTypes()) {
-        auto type = t.cast<ShapedType>();
+      // Assume the init tensors fold onto the first results.
+      // TODO: update this assumption because the reality is more complex under
+      // linalg on tensor based transformations.
+      for (auto en : llvm::enumerate(op.getResultTypes())) {
+        auto type = en.value().cast<ShapedType>();
        if (!type.hasStaticShape())
          return rewriter.notifyMatchFailure(
              op, "dynamic shapes not currently supported");
        auto memrefType =
            MemRefType::get(type.getShape(), type.getElementType());
-        auto alloc = rewriter.create<AllocOp>(loc, memrefType);
-        newOutputBuffers.push_back(alloc);
+        bool foldedInitTensor = en.index() < op.getNumInitTensors();
+        if (foldedInitTensor) {
+          // Dealing with an init tensor requires distinguishing between 1-use
+          // and many-use cases which would create aliasing and WAR hazards.
+          Value initTensor = op.getInitTensor(en.index());
+          Value initBuffer = adaptor.init_tensors()[en.index()];
+          if (initTensor.hasOneUse()) {
+            newOutputBuffers.push_back(initBuffer);
+            continue;
+          }
+          auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+          rewriter.create<linalg::CopyOp>(loc, initBuffer, alloc);
+          newOutputBuffers.push_back(alloc);
+        } else {
+          auto alloc = rewriter.create<AllocOp>(loc, memrefType);
+          newOutputBuffers.push_back(alloc);
+        }
      }
 
      // Generate a new linalg operation that works on buffers.
@@ -101,8 +120,12 @@ struct TestBufferPlacementPreparationPass
       Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(),
                                              oldBlock.getArgumentTypes());
 
-      // Add the result arguments to the new block.
-      for (Value v : newOutputBuffers)
+      // Add the result arguments that do not come from init_tensors to the new
+      // block.
+      // TODO: update this assumption because the reality is more complex under
+      // linalg on tensor based transformations.
+      for (Value v : ValueRange(newOutputBuffers)
+                         .drop_front(adaptor.init_tensors().size()))
        newBlock->addArgument(v.getType().cast<MemRefType>().getElementType());
 
      // Clone the body of the old block to the new block.
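
One detail shared by both copies of this pattern: the drop_front over newOutputBuffers exists because an output tied to an init tensor already has a matching element block argument in the tensor form, so only outputs without an init tensor receive a freshly appended argument. A sketch of the region signature for the one-input, one-init-tensor case used in the tests:

  // Tensor form: %v1 is the element of the init tensor.
  ^bb(%v0: vector<3x4xi4>, %v1: f32):

  // Buffer form: %v1 now serves as the output element, so nothing is appended;
  // an output without an init tensor would get one extra element argument here.
  ^bb(%v0: vector<3x4xi4>, %v1: f32):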
