[WIP][flang][OpenMP] Experimental pass to map do concurrent to OMP

ergawy · ergawy · commit 4be76ecc5daf · 2024-01-08T02:42:16.000-06:00
Adds a pass to map `do concurrent` to OpenMP worksharing consturcts. For
now, only maps basic loops to `omp parallel do`. This is still a WIP
with more work needed for testing and mapping more advanced loops.
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -75,7 +75,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
     func.func @foo(%arg0: !fir.ref<!fir.array<?x?x!fir.char<1,?>>>, %arg1: !fir.ref<i64>) {
       %c10 = arith.constant 10 : index
       %c20 = arith.constant 20 : index
-      %1 = fir.load %ag1 : fir.ref<i64>
+      %1 = fir.load %arg1 : fir.ref<i64>
       %2 = fir.shape_shift %c10, %1, %c20, %1 : (index, index, index, index) -> !fir.shapeshift<2>
       %3 = hfir.declare %arg0(%2) typeparams %1 {uniq_name = "c"} (fir.ref<!fir.array<?x?x!fir.char<1,?>>>, fir.shapeshift<2>, index) -> (fir.box<!fir.array<?x?x!fir.char<1,?>>>, fir.ref<!fir.array<?x?x!fir.char<1,?>>>)
       // ... uses %3#0 as "c"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -93,6 +93,8 @@ std::unique_ptr<mlir::Pass> createFunctionAttrPass();
 std::unique_ptr<mlir::Pass>
 createFunctionAttrPass(FunctionAttrTypes &functionAttr);
 
+std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass();
+
 // declarative passes
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -370,4 +370,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
   let constructor = "::fir::createFunctionAttrPass()";
 }
 
+def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> {
+  let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";
+
+  let description = [{ This is an experimental pass to map `DO CONCURRENR` loops
+     to their correspnding equivalent OpenMP worksharing constructs.
+
+     For now the following is supported:
+       - Mapping simple loops to `parallel do`.
+
+     Still to TODO:
+       - More extensive testing.
+       - Mapping to `target teams distribute parallel do`.
+       - Allowing the user to control mapping behavior: either to the host or
+         target.
+  }];
+
+  let constructor = "::fir::createDoConcurrentConversionPass()";
+  let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -21,6 +21,7 @@ add_flang_library(FIRTransforms
   OMPMarkDeclareTarget.cpp
   VScaleAttr.cpp
   FunctionAttr.cpp
+  DoConcurrentConversion.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
@@ -0,0 +1,162 @@
+//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+#include <memory>
+
+namespace fir {
+#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
+
+namespace {
+class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
+public:
+  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    mlir::OpPrintingFlags flags;
+    flags.printGenericOpForm();
+
+    mlir::omp::ParallelOp parallelOp =
+        rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());
+
+    rewriter.createBlock(&parallelOp.getRegion());
+    mlir::Block &block = parallelOp.getRegion().back();
+
+    rewriter.setInsertionPointToEnd(&block);
+    rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());
+
+    rewriter.setInsertionPointToStart(&block);
+
+    // Clone the LB, UB, step defining ops inside the parallel region.
+    llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
+    lowerBound.push_back(
+        rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0));
+    upperBound.push_back(
+        rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0));
+    step.push_back(
+        rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0));
+
+    auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
+        doLoop.getLoc(), lowerBound, upperBound, step);
+    wsLoopOp.setInclusive(true);
+
+    auto outlineableOp =
+        mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
+    assert(outlineableOp);
+    rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());
+
+    // For the induction variable, we need to privative its allocation and
+    // binding inside the parallel region.
+    llvm::SmallSetVector<mlir::Operation *, 2> workList;
+    // Therefore, we first discover the induction variable by discovering
+    // `fir.store`s where the source is the loop's block argument.
+    workList.insert(doLoop.getInductionVar().getUsers().begin(),
+                    doLoop.getInductionVar().getUsers().end());
+    llvm::SmallSetVector<fir::StoreOp, 2> inductionVarTargetStores;
+
+    // Walk the def-chain of the loop's block argument until we hit `fir.store`.
+    while (!workList.empty()) {
+      mlir::Operation *item = workList.front();
+
+      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item)) {
+        inductionVarTargetStores.insert(storeOp);
+      } else {
+        workList.insert(item->getUsers().begin(), item->getUsers().end());
+      }
+
+      workList.remove(item);
+    }
+
+    // For each collected `fir.sotre`, find the target memref's alloca's and
+    // declare ops.
+    llvm::SmallSetVector<mlir::Operation *, 4> declareAndAllocasToClone;
+    for (auto storeOp : inductionVarTargetStores) {
+      mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
+
+      for (auto operand : storeTarget->getOperands()) {
+        declareAndAllocasToClone.insert(operand.getDefiningOp());
+      }
+      declareAndAllocasToClone.insert(storeTarget);
+    }
+
+    mlir::IRMapping mapper;
+
+    // Collect the memref defining ops in the parallel region.
+    for (mlir::Operation *opToClone : declareAndAllocasToClone) {
+      rewriter.clone(*opToClone, mapper);
+    }
+
+    // Clone the loop's body inside the worksharing construct using the mapped
+    // memref values.
+    rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
+                               wsLoopOp.getRegion().begin(), mapper);
+
+    mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator();
+    rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back());
+    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
+    rewriter.eraseOp(terminator);
+
+    rewriter.eraseOp(doLoop);
+
+    return mlir::success();
+  }
+};
+
+class DoConcurrentConversionPass
+    : public fir::impl::DoConcurrentConversionPassBase<
+          DoConcurrentConversionPass> {
+public:
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+
+    if (func.isDeclaration()) {
+      return;
+    }
+
+    auto *context = &getContext();
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<DoConcurrentConversion>(context);
+    mlir::ConversionTarget target(*context);
+    target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
+                           mlir::arith::ArithDialect, mlir::func::FuncDialect,
+                           mlir::omp::OpenMPDialect>();
+
+    target.addDynamicallyLegalOp<fir::DoLoopOp>(
+        [](fir::DoLoopOp op) { return !op.getUnordered(); });
+
+    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
+                                               std::move(patterns)))) {
+      mlir::emitError(mlir::UnknownLoc::get(context),
+                      "error in converting do-concurrent op");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() {
+  return std::make_unique<DoConcurrentConversionPass>();
+}
diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic.mlir
@@ -0,0 +1,60 @@
+// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s
+
+// CHECK-LABEL: func.func @do_concurrent_basic
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+    // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    // CHECK: %[[C1:.*]] = arith.constant 1 : i32
+    // CHECK: %[[C10:.*]] = arith.constant 10 : i32
+
+    %0 = fir.alloca i32 {bindc_name = "i"}
+    %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+    %c10 = arith.constant 10 : index
+    %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %7 = fir.convert %c1_i32 : (i32) -> index
+    %c10_i32 = arith.constant 10 : i32
+    %8 = fir.convert %c10_i32 : (i32) -> index
+    %c1 = arith.constant 1 : index
+
+    // CHECK-NOT: fir.do_loop
+
+    // CHECK: omp.parallel {
+
+    // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+    // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+    // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+    // CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+     // CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+     // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+     // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+     // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+     // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+     // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+     // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]])  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+     // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+     // CHECK-NEXT: omp.yield
+     // CHECK-NEXT: }
+
+     // CHECK-NEXT: omp.terminator
+     // CHECK-NEXT: }
+    fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+      %13 = fir.convert %arg0 : (index) -> i32
+      fir.store %13 to %1#1 : !fir.ref<i32>
+      %14 = fir.load %1#0 : !fir.ref<i32>
+      %15 = fir.load %1#0 : !fir.ref<i32>
+      %16 = fir.convert %15 : (i32) -> i64
+      %17 = hlfir.designate %4#0 (%16)  : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+      hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+    }
+
+    // CHECK-NOT: fir.do_loop
+
+    return
+  }