Pass the original tensor type when creating ToTensorOp in SCF op bufferizations; add more/better tests

christopherbate · christopherbate · commit 22eeae8e6921 · 2024-11-26T01:47:17.000Z
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -486,15 +486,17 @@ getBuffers(RewriterBase &rewriter, const MutableOperandRange &operands,
 /// ToTensorOps, so that the block body can be moved over to the new op.
 static SmallVector<Value>
 getBbArgReplacements(RewriterBase &rewriter, Block::BlockArgListType bbArgs,
+                     Block::BlockArgListType oldBbArgs,
                      const DenseSet<int64_t> &tensorIndices) {
   SmallVector<Value> result;
   for (const auto &it : llvm::enumerate(bbArgs)) {
     size_t idx = it.index();
     Value val = it.value();
     if (tensorIndices.contains(idx)) {
-      result.push_back(
-          rewriter.create<bufferization::ToTensorOp>(val.getLoc(), val)
-              .getResult());
+      result.push_back(rewriter
+                           .create<bufferization::ToTensorOp>(
+                               val.getLoc(), oldBbArgs[idx].getType(), val)
+                           .getResult());
     } else {
       result.push_back(val);
     }
@@ -764,7 +766,8 @@ struct ForOpInterface
     // iter_args of the new loop in ToTensorOps.
     rewriter.setInsertionPointToStart(loopBody);
     SmallVector<Value> iterArgs =
-        getBbArgReplacements(rewriter, newForOp.getRegionIterArgs(), indices);
+        getBbArgReplacements(rewriter, newForOp.getRegionIterArgs(),
+                             forOp.getRegionIterArgs(), indices);
     iterArgs.insert(iterArgs.begin(), newForOp.getInductionVar());
 
     // Move loop body to new loop.
@@ -1001,16 +1004,18 @@ struct WhileOpInterface
     // The old block uses tensors, so wrap the (memref) bbArgs of the new block
     // in ToTensorOps.
     rewriter.setInsertionPointToStart(newBeforeBody);
-    SmallVector<Value> newBeforeArgs = getBbArgReplacements(
-        rewriter, newWhileOp.getBeforeArguments(), indicesBefore);
+    SmallVector<Value> newBeforeArgs =
+        getBbArgReplacements(rewriter, newWhileOp.getBeforeArguments(),
+                             whileOp.getBeforeArguments(), indicesBefore);
     rewriter.mergeBlocks(whileOp.getBeforeBody(), newBeforeBody, newBeforeArgs);
 
     // Set up new iter_args and move the loop body block to the new op.
     // The old block uses tensors, so wrap the (memref) bbArgs of the new block
     // in ToTensorOps.
     rewriter.setInsertionPointToStart(newAfterBody);
-    SmallVector<Value> newAfterArgs = getBbArgReplacements(
-        rewriter, newWhileOp.getAfterArguments(), indicesAfter);
+    SmallVector<Value> newAfterArgs =
+        getBbArgReplacements(rewriter, newWhileOp.getAfterArguments(),
+                             whileOp.getAfterArguments(), indicesAfter);
     rewriter.mergeBlocks(whileOp.getAfterBody(), newAfterBody, newAfterArgs);
 
     // Replace loop results.
@@ -1256,8 +1261,8 @@ struct ForallOpInterface
              forallOp.getBody()->getArguments().drop_front(rank), buffers)) {
       BlockArgument bbArg = std::get<0>(it);
       Value buffer = std::get<1>(it);
-      Value bufferAsTensor =
-          rewriter.create<ToTensorOp>(forallOp.getLoc(), buffer);
+      Value bufferAsTensor = rewriter.create<ToTensorOp>(
+          forallOp.getLoc(), bbArg.getType(), buffer);
       bbArg.replaceAllUsesWith(bufferAsTensor);
     }
 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir
@@ -1,27 +1,5 @@
 // RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space" -split-input-file | FileCheck %s
 
-// TODO: move to tensor dialect tests
-func.func @from_elements(%fill: f32, %f: f32, %idx: index) -> tensor<3xf32, 1> {
-  %t = tensor.from_elements %fill, %fill, %fill : tensor<3xf32, 1>
-  %i = tensor.insert %f into %t[%idx] : tensor<3xf32, 1>
-  return %i : tensor<3xf32, 1>
-}
-
-// CHECK-LABEL: @from_elements
-//  CHECK-SAME: (%[[arg0:.+]]: f32, %[[arg1:.+]]: f32, %[[arg2:.+]]: index) -> tensor<3xf32, 1 : i64>
-//       CHECK:     %[[alloc:.+]] = memref.alloc() {{.*}} : memref<3xf32, 1>
-//       CHECK-DAG:     %[[c0:.+]] = arith.constant 0 : index
-//       CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
-//       CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
-//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c0]]] : memref<3xf32, 1>
-//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c1]]] : memref<3xf32, 1>
-//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c2]]] : memref<3xf32, 1>
-//       CHECK:     memref.store %[[arg1]], %[[alloc]][%[[arg2]]] : memref<3xf32, 1>
-//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<3xf32, 1> -> tensor<3xf32, 1 : i64>
-//       CHECK:     return %[[v0]] : tensor<3xf32, 1 : i64>
-
-// -----
-
 func.func @alloc_tesor_with_space_no_encoding() -> tensor<128xf32> {
   %0 = bufferization.alloc_tensor() {memory_space = 1 : i64} : tensor<128xf32>
   return %0 : tensor<128xf32>
@@ -131,22 +109,3 @@ func.func @materialize_in_destination(%arg0: tensor<128xf32, 1>) -> tensor<128xf
 //       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2>
 //       CHECK:     %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> -> tensor<128xf32, 2 : i64>
 //       CHECK:     return %[[v1]] : tensor<128xf32, 2 : i64>
-
-// -----
-
-func.func @scf_for_iter_arg(%arg0: tensor<128xf32, 1>, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> tensor<128xf32, 1> {
-  %0 = scf.for %i = %arg1 to %arg2 step %arg3 iter_args(%iter = %arg0) -> tensor<128xf32, 1> {
-    %0 = tensor.insert %arg4 into %iter[%i] : tensor<128xf32, 1>
-    scf.yield %0 : tensor<128xf32, 1>
-  }
-  return %0 : tensor<128xf32, 1>
-}
-
-// -----
-
-func.func @scf_execute_region(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 1> {
-  %0 = scf.execute_region -> tensor<128xf32, 1> {
-    scf.yield %arg0 : tensor<128xf32, 1>
-  }
-  return %0 : tensor<128xf32, 1>
-}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space allow-return-allocs-from-loops allow-unknown-ops" -allow-unregistered-dialect -split-input-file | FileCheck %s
+
+// Here and below, the unknown op 'some.use' forces the 'bufferization.to_tensor' operations to remain in the IR,
+// allowing us to check that the encoding on the tensor type (e.g. of '%iter' or '%o') is correctly preserved.
+
+func.func @scf_for_iter_arg(%arg0: tensor<128xf32, 1>, %arg1: index, %arg2: index, %arg3: index) -> tensor<128xf32, 1> {
+  %0 = scf.for %i = %arg1 to %arg2 step %arg3 iter_args(%iter = %arg0) -> tensor<128xf32, 1> {
+    %0 = "some.use"(%iter) : (tensor<128xf32, 1>) -> tensor<128xf32, 1>
+    scf.yield %0 : tensor<128xf32, 1>
+  }
+  return %0 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_for_iter_arg
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index)
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> -> memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 1>
+//       CHECK:     %[[cast:.+]] = memref.cast %[[alloc]] : memref<128xf32, 1> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = scf.for %{{.+}} = %[[arg1]] to %[[arg2]] step %[[arg3]] iter_args(%[[arg6:.+]] = %[[cast]]) -> (memref<128xf32, strided<[?], offset: ?>, 1>)
+//  CHECK-NEXT:       %[[v3:.+]] = bufferization.to_tensor %[[arg6]] : memref<128xf32, strided<[?], offset: ?>, 1> -> tensor<128xf32, 1 : i64>
+//  CHECK-NEXT:       %[[v4:.+]] = "some.use"(%[[v3]]) : (tensor<128xf32, 1 : i64>) -> tensor<128xf32, 1 : i64>
+//  CHECK-NEXT:       %[[v5:.+]] = bufferization.to_memref %[[v4]] : tensor<128xf32, 1 : i64> -> memref<128xf32, strided<[?], offset: ?>, 1>
+//  CHECK-NEXT:       scf.yield %[[v5]] : memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v2:.+]] = bufferization.to_tensor %[[v1]] : memref<128xf32, strided<[?], offset: ?>, 1> -> tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v2]] : tensor<128xf32, 1 : i64>
+
+// -----
+
+func.func @scf_forall(
+    %idx: index,
+    %idx2: index,
+    %arg1: tensor<?xf32, 1>,
+    %arg2: tensor<?xf32, 1>) -> (tensor<?xf32, 1>) {
+  %cst = arith.constant 4.200000e+01 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32, 1>) {
+      %8 = "some.use"(%o) : (tensor<?xf32, 1>) -> tensor<?xf32, 1>
+      scf.forall.in_parallel {
+        tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
+          tensor<?xf32, 1> into tensor<?xf32, 1>
+      }
+  }
+  return %2 : tensor<?xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_forall
+//       CHECK:     scf.forall
+//       CHECK:       %[[v2:.+]] = bufferization.to_tensor %{{.+}} : memref<?xf32, 1> -> tensor<?xf32, 1 : i64>
+//       CHECK:       %[[v3:.+]] = "some.use"(%[[v2]]) : (tensor<?xf32, 1 : i64>) -> tensor<?xf32, 1 : i64>
+//       CHECK:       bufferization.to_memref %[[v3]] : tensor<?xf32, 1 : i64> -> memref<?xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = bufferization.to_tensor %{{.+}} : memref<?xf32, 1> -> tensor<?xf32, 1 : i64>
+//       CHECK:     return %[[v1]] : tensor<?xf32, 1 : i64>
+
+// -----
+
+func.func @scf_execute_region(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 1> {
+  %0 = scf.execute_region -> tensor<128xf32, 1> {
+    scf.yield %arg0 : tensor<128xf32, 1>
+  }
+  %1 = "some.use"(%0) : (tensor<128xf32, 1>) -> tensor<128xf32, 1>
+  return %1 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_execute_region
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>)
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> -> memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = scf.execute_region -> memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:       scf.yield %[[v0]] : memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v2:.+]] = bufferization.to_tensor %[[v1]] : memref<128xf32, strided<[?], offset: ?>, 1> -> tensor<128xf32, 1 : i64>
+//       CHECK:     %[[v3:.+]] = "some.use"(%[[v2]]) : (tensor<128xf32, 1 : i64>) -> tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v3]] : tensor<128xf32, 1 : i64>
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space" -split-input-file | FileCheck %s
+
+func.func @from_elements(%fill: f32, %f: f32, %idx: index) -> tensor<3xf32, 1> {
+  %t = tensor.from_elements %fill, %fill, %fill : tensor<3xf32, 1>
+  %i = tensor.insert %f into %t[%idx] : tensor<3xf32, 1>
+  return %i : tensor<3xf32, 1>
+}
+
+// CHECK-LABEL: @from_elements
+//  CHECK-SAME: (%[[arg0:.+]]: f32, %[[arg1:.+]]: f32, %[[arg2:.+]]: index) -> tensor<3xf32, 1 : i64>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {{.*}} : memref<3xf32, 1>
+//       CHECK-DAG:     %[[c0:.+]] = arith.constant 0 : index
+//       CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
+//       CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c0]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c1]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c2]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg1]], %[[alloc]][%[[arg2]]] : memref<3xf32, 1>
+//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<3xf32, 1> -> tensor<3xf32, 1 : i64>
+//       CHECK:     return %[[v0]] : tensor<3xf32, 1 : i64>