add merge forall pass

zhczhong · zhczhong · commit 881615925c9c · 2024-07-30T19:35:59.000-07:00
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
@@ -60,6 +60,19 @@ def DeepTileContractionNamedOp
   ];
 }
 
+def SinkOpIntoInnerLoop : Pass<"sink-op-into-inner-loop"> {
+  let summary = "Sink operations into inner loops";
+  let description = [{The pass tries to sink operations into inner loops as deep as possible to maximize the chance for outer loop optimization.
+  }];
+  let dependentDialects = [];
+}
+
+def MergeNestedForall : Pass<"merge-nested-forall"> {
+  let summary = "Merge nested scf.forall operations";
+  let description = [{The pass tries to merge nested forall operations.}];
+  let dependentDialects = ["scf::SCFDialect"];
+}
+
 def GCCPUPipeline : Pass<"gc-cpu-pipeline"> {
   let summary = "All-in-one pipeline for GC for CPU";
   let dependentDialects = [
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
@@ -14,6 +14,8 @@ add_mlir_library(GCPasses
   Pipeline.cpp
   DeepTileContractionNamedOp.cpp
   Tiling.cpp
+  SinkOpIntoInnerLoop.cpp
+  MergeNestedForall.cpp
 
   ADDITIONAL_HEADER_DIRS
     ${PROJECT_SOURCE_DIR}/include
diff --git a/lib/gc/Transforms/DeepTileContractionNamedOp.cpp b/lib/gc/Transforms/DeepTileContractionNamedOp.cpp
@@ -464,30 +464,36 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
           currentOp.getNumLoops(), getAsIndexOpFoldResult(b.getContext(), 0));
       SmallVector<unsigned> reductionDims;
       currentOp.getReductionDims(reductionDims);
+      bool tileOnReduction = false;
       for (auto [d, tile] : llvm::zip(currentDim, currentTileSize)) {
+        if (llvm::find(reductionDims, d) != reductionDims.end()) {
+          tileOnReduction = true;
+        }
         if (llvm::find(reductionDims, d) != reductionDims.end() &&
-            !dyn_cast<PartialReductionOpInterface>(currentOp.getOperation()))
+            !dyn_cast<PartialReductionOpInterface>(currentOp.getOperation())) {
           tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), 0);
-        else
+          tileOnReduction = false;
+        } else
           tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), tile);
       }
       SmallVector<Range> loopRanges =
           cast<TilingInterface>(currentOp.getOperation()).getIterationDomain(b);
       OpBuilder::InsertionGuard guard(b);
       b.setInsertionPoint(currentOp);
-      if (auto partialInterface =
-              dyn_cast<PartialReductionOpInterface>(currentOp.getOperation())) {
+      if (tileOnReduction) {
+        auto partialInterface =
+            dyn_cast<PartialReductionOpInterface>(currentOp.getOperation());
         for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
-          if (isConstantIntValue(tile, 0)) {
+          if (isConstantIntValue(tile, 0) &&
+              llvm::find(reductionDims, d) != reductionDims.end()) {
             tileSizes[idx] = loopRanges[idx].size;
           }
         }
-
         SmallVector<OpFoldResult> newParallelDims;
         for (auto i = 0UL; i < reductionDims.size(); i++) {
           newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
         }
-        auto tilingResult = linalgX::tileAllUsingForall(
+        auto tilingResult = linalgX::tileReductionUsingForall(
             b, cast<PartialReductionOpInterface>(currentOp.getOperation()), {},
             tileSizes, newParallelDims, std::nullopt);
         if (failed(tilingResult) &&
@@ -503,8 +509,8 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
             }
           }
         }
-      } else if (auto tilingInterface =
-                     cast<TilingInterface>(currentOp.getOperation())) {
+      } else {
+        auto tilingInterface = cast<TilingInterface>(currentOp.getOperation());
         auto tilingResult = linalg::tileToForallOpUsingTileSizes(
             b, tilingInterface, tileSizes, std::nullopt);
         if (failed(tilingResult))
@@ -597,11 +603,15 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
                                ? (cfg.NBlock - 1) / cfg.innerMostNBlock + 1
                                : cfg.NBlock;
     // Outer
-    option.nestedTileSizes.emplace_back(SmallVector<size_t>{
-        MParallelBlockSize, NParallelBlockSize, KParallelBlockSize});
-    option.loopType.emplace_back(OuterLoopGenerationOption::LoopType::ForallOp);
-    option.loopDim.emplace_back(
-        SmallVector<size_t>{MDimPos[0], NDimPos[0], KDimPos[0]});
+    for (auto [tile, dim] :
+         llvm::zip(SmallVector<size_t>{KParallelBlockSize, MParallelBlockSize,
+                                       NParallelBlockSize},
+                   SmallVector<size_t>{KDimPos[0], MDimPos[0], NDimPos[0]})) {
+      option.nestedTileSizes.emplace_back(SmallVector<size_t>{tile});
+      option.loopType.emplace_back(
+          OuterLoopGenerationOption::LoopType::ForallOp);
+      option.loopDim.emplace_back(SmallVector<size_t>{dim});
+    }
     // Middle
     for (auto [tile, dim] :
          llvm::zip(SmallVector<size_t>{MOuterBlockSize, NOuterBlockSize,
diff --git a/lib/gc/Transforms/MergeNestedForall.cpp b/lib/gc/Transforms/MergeNestedForall.cpp
@@ -0,0 +1,100 @@
+//===-- MergeNestedForall.cpp - Merge nested scf.forall ops ---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Transforms/ControlFlowSinkUtils.h"
+
+namespace mlir {
+namespace gc {
+#define GEN_PASS_DEF_MERGENESTEDFORALL
+#include "gc/Transforms/Passes.h.inc"
+
+namespace {
+/// Merges a forall whose body holds only another forall into a single forall.
+struct MergeNestedForallLoops : public OpRewritePattern<scf::ForallOp> {
+  using OpRewritePattern<scf::ForallOp>::OpRewritePattern;
+  /// Fails (leaving the IR untouched) whenever one of the guards below trips.
+  LogicalResult matchAndRewrite(scf::ForallOp op,
+                                PatternRewriter &rewriter) const override {
+    Block &outerBody = *op.getBody();
+    if (!llvm::hasSingleElement(outerBody.without_terminator()))
+      return failure();
+    // The single non-terminator op has to be an scf.forall itself.
+    auto innerOp = dyn_cast<scf::ForallOp>(outerBody.front());
+    if (!innerOp)
+      return failure();
+    // Inner bounds/steps must not be block arguments of the outer body.
+    for (auto val : outerBody.getArguments())
+      if (llvm::is_contained(innerOp.getDynamicLowerBound(), val) ||
+          llvm::is_contained(innerOp.getDynamicUpperBound(), val) ||
+          llvm::is_contained(innerOp.getDynamicStep(), val))
+        return failure();
+
+    // Reductions are not supported yet: reject loops with non-empty inits.
+    if (!op.getInits().empty() || !innerOp.getInits().empty())
+      return failure();
+    // Clones every op of the inner body with both loops' block args remapped.
+    auto bodyBuilder = [&](OpBuilder &builder, Location /*loc*/,
+                           ValueRange iterVals) {
+      Block &innerBody = *innerOp.getBody();
+      assert(iterVals.size() ==
+             (outerBody.getNumArguments() + innerBody.getNumArguments()));
+      IRMapping mapping;
+      mapping.map(outerBody.getArguments(),
+                  iterVals.take_front(outerBody.getNumArguments()));
+      mapping.map(innerBody.getArguments(),
+                  iterVals.take_back(innerBody.getNumArguments()));
+      for (Operation &op : innerBody)
+        builder.clone(op, mapping);
+    };
+    // Returns `first` followed by `second` as one OpFoldResult vector.
+    auto concatValues = [](const auto &first, const auto &second) {
+      SmallVector<OpFoldResult> ret;
+      ret.reserve(first.size() + second.size());
+      ret.assign(first.begin(), first.end());
+      ret.append(second.begin(), second.end());
+      return ret;
+    };
+    // Outer bounds/steps go first, matching the take_front/take_back split.
+    auto newLowerBounds =
+        concatValues(op.getMixedLowerBound(), innerOp.getMixedLowerBound());
+    auto newUpperBounds =
+        concatValues(op.getMixedUpperBound(), innerOp.getMixedUpperBound());
+    auto newSteps = concatValues(op.getMixedStep(), innerOp.getMixedStep());
+    rewriter.replaceOpWithNewOp<scf::ForallOp>(
+        op, newLowerBounds, newUpperBounds, newSteps, ValueRange{},
+        std::nullopt, bodyBuilder);
+    return success();
+  }
+};
+/// Pass that greedily applies MergeNestedForallLoops to the root operation.
+struct MergeNestedForall
+    : public impl::MergeNestedForallBase<MergeNestedForall> {
+public:
+  void runOnOperation() final {
+    auto &ctx = getContext();
+    RewritePatternSet patterns(&ctx);
+    // MergeNestedForallLoops is the only pattern this pass registers.
+    patterns.add<MergeNestedForallLoops>(patterns.getContext());
+    // A failure reported by the greedy driver is surfaced as a pass failure.
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
+                                            std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+} // namespace gc
+} // namespace mlir
diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
@@ -32,6 +32,17 @@
 
 namespace mlir::gc {
 
+void populateCleanUpPasses(mlir::OpPassManager &pm) { // shared clean-up
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+  pm.addPass(createLoopInvariantCodeMotionPass());
+  pm.addPass(createControlFlowSinkPass());
+  pm.addPass(createCSEPass()); // re-run CSE after LICM and control-flow sink
+  pm.addPass(createSCCPPass());
+  pm.addPass(createMem2Reg());
+  pm.addPass(createTopologicalSortPass());
+}
+
 // linalg + linalgX + tensor
 void populateFrontendPasses(mlir::OpPassManager &pm) {
   pm.addPass(createConvertOneDNNGraphToLinalg());
@@ -42,13 +53,14 @@ void populateTensorPasses(mlir::OpPassManager &pm) {
   // todo: padding propagation pass
   // todo: layout propagation pass
   // todo: tensor constant propagation pass
-  // todo: linalg.matmul lowering to (scf.loop + linalg.brgemm) pass
+  pm.addNestedPass<func::FuncOp>(createDeepTileContractionNamedOp());
   // todo: fine-grain fusion pass
   // todo: lower linalg to arith/math on virtual vector pass
 
   // REMOVE this pass after the above passes are added. Currently we add this
   // pass to make the pipeline work properly
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizeNamedOpsPass());
+  populateCleanUpPasses(pm);
 }
 
 // scf + arith + math + vector + tensor + linalg.brgemm
@@ -67,6 +79,7 @@ void populateVectorPasses(mlir::OpPassManager &pm) {
   // oneDNN graph spec
   pm.addNestedPass<func::FuncOp>(arith::createArithExpandOpsPass());
   // todo: lower to physical vector pass, device dependent pass
+  populateCleanUpPasses(pm);
 }
 
 // scf + arith + math + vector + memref + linalg.brgemm
@@ -86,6 +99,7 @@ void populateBufferizationPasses(mlir::OpPassManager &pm) {
   pm.addNestedPass<func::FuncOp>(bufferization::createBufferLoopHoistingPass());
   pm.addNestedPass<func::FuncOp>(bufferization::createBufferDeallocationPass());
   pm.addPass(createBufferizationToMemRefPass());
+  populateCleanUpPasses(pm);
 }
 
 // scf + arith + math + vector + memref + func/microkernel
@@ -102,6 +116,12 @@ void populateMicroKernelPasses(mlir::OpPassManager &pm) {
 void populateCPURuntimePasses(mlir::OpPassManager &pm) {
   // todo: flatten nested parallel pass to support coarse-grain usion
   // remove this pass after we add FlattenNestedParallel
+  pm.addPass(createSinkOpIntoInnerLoop());
+  pm.addPass(createMergeNestedForall());
+  populateCleanUpPasses(pm);
+  pm.addPass(createForallToParallelLoopPass());
+  pm.addPass(createParallelLoopFusionPass());
+  pm.addPass(createLoopInvariantCodeMotionPass());
   pm.addPass(createConvertSCFToOpenMPPass());
 }
 
@@ -141,7 +161,7 @@ void populateCPUPipeline(mlir::OpPassManager &pm) {
   pm.addNestedPass<func::FuncOp>(createConvertLinalgToParallelLoopsPass());
   populateMicroKernelPasses(pm);
   populateCPURuntimePasses(pm);
-  // // back-end, llvm dialect
+  // back-end, llvm dialect
   populateLLVMPasses(pm);
 }
 
diff --git a/lib/gc/Transforms/SinkOpIntoInnerLoop.cpp b/lib/gc/Transforms/SinkOpIntoInnerLoop.cpp
@@ -0,0 +1,51 @@
+//===-- SinkOpIntoInnerLoop.cpp - Sink ops into inner loops ---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/IR/Dominance.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Transforms/ControlFlowSinkUtils.h"
+
+namespace mlir {
+namespace gc {
+#define GEN_PASS_DEF_SINKOPINTOINNERLOOP
+#include "gc/Transforms/Passes.h.inc"
+
+namespace {
+/// Pass that sinks memory-effect-free ops into the regions of loop-like ops.
+struct SinkOpIntoInnerLoop
+    : public impl::SinkOpIntoInnerLoopBase<SinkOpIntoInnerLoop> {
+public:
+  void runOnOperation() final {
+    auto &domInfo = getAnalysis<DominanceInfo>();
+    getOperation()->walk([&](LoopLikeOpInterface loop) {
+      SmallVector<Region *> regionsToSink;
+      // Every region of the loop op is offered as a sink target.
+      for (auto &it : loop->getRegions()) {
+        regionsToSink.push_back(&it);
+      }
+      // Only ops without memory effects are allowed to be sunk.
+      controlFlowSink(
+          regionsToSink, domInfo,
+          [](Operation *op, Region *) { return isMemoryEffectFree(op); },
+          [](Operation *op, Region *region) {
+            // Move the operation to the beginning of the region's entry block,
+            // which keeps SSA dominance intact provided that all of the
+            // operation's uses are in the region.
+            op->moveBefore(&region->front(), region->front().begin());
+          });
+    });
+  }
+};
+
+} // namespace
+} // namespace gc
+} // namespace mlir
diff --git a/lib/gc/Transforms/Tiling.cpp b/lib/gc/Transforms/Tiling.cpp