intel · ciyongch · Sep 4, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/lib/gc/Transforms/MemRefToCPURuntime.cpp b/lib/gc/Transforms/MemRefToCPURuntime.cpp
@@ -19,7 +19,6 @@
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/ADT/SmallSet.h"
 
 namespace mlir {
 namespace gc {
@@ -38,9 +37,10 @@ bool hasParallelParent(Operation *op) {
   }
   return false;
 }
-struct AlignedAllocLowering : public OpRewritePattern<memref::AllocOp> {
-  using OpRewritePattern<memref::AllocOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(memref::AllocOp op,
+
+struct AlignedAllocLowering : public OpRewritePattern<memref::AllocaOp> {
+  using OpRewritePattern<memref::AllocaOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(memref::AllocaOp op,
                                 PatternRewriter &rewriter) const final {
     auto loc = op->getLoc();
     MemRefType type = op.getMemref().getType();
@@ -54,48 +54,49 @@ struct AlignedAllocLowering : public OpRewritePattern<memref::AllocOp> {
     return success();
   }
 };
-
-struct AlignedDeallocLowering : public OpRewritePattern<memref::DeallocOp> {
-  using OpRewritePattern<memref::DeallocOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(memref::DeallocOp op,
-                                PatternRewriter &rewriter) const final {
-    auto loc = op->getLoc();
-    Value memref = op.getMemref();
-    cpuruntime::DeallocOp newDeallocOp =
-        rewriter.create<cpuruntime::DeallocOp>(loc, memref);
-    if (hasParallelParent(op))
-      newDeallocOp.setThreadLocal(true);
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
 struct ConvertMemRefToCPURuntime
     : public impl::ConvertMemRefToCPURuntimeBase<ConvertMemRefToCPURuntime> {
 
   void runOnOperation() final {
     auto *ctx = &getContext();
-    // Create a local set to store operations that should not be transformed.
-    llvm::SmallSet<Operation *, 16> noTransformOps;
 
-    // Walk through the module to find func::FuncOp instances.
+    // Create a deallocOp corresponding to each alloca's location
     getOperation()->walk([&](func::FuncOp funcOp) {
-      BufferViewFlowAnalysis analysis(funcOp);
-      // Now walk through the operations within the func::FuncOp.
-      funcOp.walk([&](Operation *op) {
-        if (op->hasTrait<OpTrait::ReturnLike>()) {
-          for (Value operand : op->getOperands()) {
-            if (isa<MemRefType>(operand.getType())) {
-              auto aliases = analysis.resolveReverse(operand);
-              // Check if any of the returned memref is allocated within scope.
-              for (auto &&alias : aliases) {
-                if (Operation *allocOp =
-                        alias.getDefiningOp<memref::AllocOp>()) {
-                  noTransformOps.insert(allocOp);
-                }
-              }
+      funcOp.walk([&](memref::AllocaOp op) {
+        Region *parentRegion = op->getParentRegion();
+        OpBuilder builder(op);
+        // Find the first deallocOp in the current region
+        cpuruntime::DeallocOp firstDeallocOp;
+        for (Block &block : parentRegion->getBlocks()) {
+          for (Operation &operation : block) {
+            if (auto deallocOp = dyn_cast<cpuruntime::DeallocOp>(&operation)) {
+              firstDeallocOp = deallocOp;
+              break;
             }
           }
+          if (firstDeallocOp)
+            break;
+        }
+
+        // If a deallocOp was found, insert the new dealloc before it
+        if (firstDeallocOp) {
+          builder.setInsertionPoint(firstDeallocOp);
+        } else {
+          // If no deallocOp was found, insert at the end of the region before
+          // the terminator
+          Block &lastBlock = parentRegion->back();
+          builder.setInsertionPointToEnd(&lastBlock);
+          if (!lastBlock.empty() &&
+              lastBlock.back().hasTrait<OpTrait::IsTerminator>()) {
+            builder.setInsertionPoint(&lastBlock.back());
+          }
+        }
+
+        // Create the dealloc operation
+        auto deallocOp =
+            builder.create<cpuruntime::DeallocOp>(op.getLoc(), op.getResult());
+        if (hasParallelParent(op)) {
+          deallocOp.setThreadLocal(true);
         }
       });
     });
@@ -104,16 +105,10 @@ struct ConvertMemRefToCPURuntime
     ConversionTarget target(getContext());
     // Make all operations legal by default.
     target.markUnknownOpDynamicallyLegal([](Operation *op) { return true; });
-    target.addDynamicallyLegalOp<memref::AllocOp, memref::DeallocOp>(
-        [&](Operation *op) {
-          // Return true if the operation is in the noTransformOps set, making
-          // it dynamically legal.
-          return noTransformOps.find(op) != noTransformOps.end();
-        });
+    target.addIllegalOp<memref::AllocaOp>();
     // set pattern
     RewritePatternSet patterns(ctx);
     patterns.add<AlignedAllocLowering>(ctx);
-    patterns.add<AlignedDeallocLowering>(ctx);
     // perform conversion
     if (failed(
             applyFullConversion(getOperation(), target, std::move(patterns)))) {

diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
@@ -24,6 +24,7 @@
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/Passes.h"
+#include <climits>
 
 #include "gc/Dialect/CPURuntime/Transforms/CPURuntimePasses.h"
 #include "gc/Dialect/Linalgx/LinalgxDialect.h"
@@ -110,9 +111,10 @@ void populateBufferizationPasses(mlir::OpPassManager &pm) {
   opt.hoistStaticAllocs = true;
   pm.addPass(bufferization::createBufferResultsToOutParamsPass(opt));
   pm.addPass(bufferization::createDropEquivalentBufferResultsPass());
-  pm.addNestedPass<func::FuncOp>(
-      bufferization::createPromoteBuffersToStackPass());
-  bufferization::BufferDeallocationPipelineOptions deallocOption;
+  pm.addNestedPass<func::FuncOp>(bufferization::createPromoteBuffersToStackPass(
+      /*maxAllocSizeInBytes=*/UINT_MAX,
+      /*maxRankOfAllocatedMemRef=*/8));
+  mlir::bufferization::BufferDeallocationPipelineOptions deallocOption;
   bufferization::buildBufferDeallocationPipeline(pm, deallocOption);
   pm.addPass(createBufferizationToMemRefPass());
   populateCleanUpPasses(pm);

diff --git a/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir b/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir
@@ -1,68 +1,52 @@
 // RUN: gc-opt --split-input-file --convert-memref-to-cpuruntime %s -verify-diagnostics | FileCheck %s
-func.func @alloc() {
-  // CHECK-LABEL: func @alloc()
+
+func.func @alloca() {
+  // CHECK-LABEL: func @alloca()
   // CHECK: %[[m0:.*]] = cpuruntime.alloc() : memref<1024xf32>
-  %m0 = memref.alloc() : memref<1024xf32>
+  %m0 = memref.alloca() : memref<1024xf32>
   scf.forall (%i) in (32) {
   }
   // CHECK: cpuruntime.dealloc %[[m0]] : memref<1024xf32>
-  cpuruntime.dealloc %m0 : memref<1024xf32>
   return
 }
 
-func.func @thread_alloc() {
-  // CHECK-LABEL: func.func @thread_alloc()
+func.func @thread_alloca() {
+  // CHECK-LABEL: func.func @thread_alloca()
   // CHECK: %[[m0:.*]] = cpuruntime.alloc thread_local() : memref<1024xf32>
   scf.forall (%i) in (32) {
-    %0 = memref.alloc() : memref<1024xf32>
+    %0 = memref.alloca() : memref<1024xf32>
     // CHECK: cpuruntime.dealloc thread_local %[[m0]] : memref<1024xf32>
-    memref.dealloc %0 : memref<1024xf32>
   }
   return
 }
 
-func.func @return_alloc() -> memref<32x18xf32> {
-  // CHECK-LABEL: func @return_alloc() -> memref<32x18xf32>
-  // CHECK: %[[m0:.*]] = memref.alloc() : memref<32x18xf32>
-  %0 = memref.alloc() : memref<32x18xf32>
-  return %0 : memref<32x18xf32>
+func.func @dynamic_ranked_alloca(%arg0: memref<*xf32>) {
+  // CHECK-LABEL: func @dynamic_ranked_alloca(%arg0: memref<*xf32>)
+  // CHECK: %[[RANK:.*]] = memref.rank %{{.*}} : memref<*xf32>
+  // CHECK: %[[m0:.*]] = cpuruntime.alloc(%[[RANK]]) : memref<?xindex>
+  // CHECK: cpuruntime.dealloc %[[m0]] : memref<?xindex>
+  %0 = memref.rank %arg0 : memref<*xf32>
+  %alloca = memref.alloca(%0) : memref<?xindex>
+  return
 }
 
-func.func @yield_alloc() -> memref<32x18xf32> {
-  // CHECK-LABEL: func @yield_alloc() -> memref<32x18xf32>
-  // CHECK: %[[m0:.*]] = memref.alloc() : memref<32x18xf32>
-  %c32 = arith.constant 32 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %lastBuffer = memref.alloc() : memref<32x18xf32>
-  scf.for %arg3 = %c0 to %c32 step %c1 iter_args(%arg1 = %lastBuffer) -> (memref<32x18xf32>) {
-    %newBuffer = memref.alloc() : memref<32x18xf32>
-    memref.dealloc %arg1 : memref<32x18xf32>
-    scf.yield %newBuffer : memref<32x18xf32>
+func.func @loop_nested_if_alloca(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>) {
+  // CHECK-LABEL: func @loop_nested_if_alloca(%arg0: index, %arg1: index, %arg2: index, %arg3: memref<2xf32>)
+  // CHECK: %[[m0:.*]] = cpuruntime.alloc() : memref<2xf32>
+  %alloca = memref.alloca() : memref<2xf32>
+  %0 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %arg3) -> (memref<2xf32>) {
+    %1 = arith.cmpi eq, %arg5, %arg1 : index
+    %2 = scf.if %1 -> (memref<2xf32>) {
+      // CHECK: yield %[[m0]] : memref<2xf32>
+      scf.yield %alloca : memref<2xf32>
+    } else {
+      // CHECK: %[[m1:.*]] = cpuruntime.alloc() : memref<2xf32>
+      // CHECK: cpuruntime.dealloc %[[m1]] : memref<2xf32>
+      %alloca_0 = memref.alloca() : memref<2xf32>
+      scf.yield %arg6 : memref<2xf32>
+    }
+    scf.yield %2 : memref<2xf32>
   }
-  return %lastBuffer : memref<32x18xf32>
-}
-
-func.func @return_view_alloc() -> memref<16xf32> {
-  // CHECK-LABEL: func @return_view_alloc() -> memref<16xf32>
-  // CHECK: %[[m0:.*]] = memref.alloc() : memref<128xi8>
-  %c0 = arith.constant 0: index
-  %f0 = arith.constant 0.0: f32
-  %alloc = memref.alloc() : memref<128xi8>
-  %view = memref.view %alloc[%c0][] : memref<128xi8> to memref<32xf32>
-  %subview = memref.subview %view[0][16][1] : memref<32xf32> to memref<16xf32>
-  return %subview : memref<16xf32>
-}
-
-func.func @alloc_dealloc_view() {
-  // CHECK-LABEL: func @alloc_dealloc_view()
-  // CHECK: %[[m0:.*]] = cpuruntime.alloc() : memref<128xi8>
-  %c0 = arith.constant 0: index
-  %f0 = arith.constant 0.0: f32
-  %alloc = memref.alloc() : memref<128xi8>
-  %view = memref.view %alloc[%c0][] : memref<128xi8> to memref<32xf32>
-  %subview = memref.subview %view[0][16][1] : memref<32xf32> to memref<16xf32>
-  // CHECK: cpuruntime.dealloc
-  memref.dealloc %subview : memref<16xf32>
+  // CHECK: cpuruntime.dealloc %[[m0]] : memref<2xf32>
   return
 }