[MLIR][SCF][OpenMP] Update reduction conversion

kiranchandramohan · kiranchandramohan · commit 465cdc8b93a3 · 2024-02-03T19:19:52.000Z
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -367,9 +367,11 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
     // TODO: consider checking it here is already a compatible reduction
     // declaration and use it instead of redeclaring.
     SmallVector<Attribute> reductionDeclSymbols;
+    SmallVector<omp::ReductionDeclareOp> ompReductionDecls;
     auto reduce = cast<scf::ReduceOp>(parallelOp.getBody()->getTerminator());
     for (int64_t i = 0, e = parallelOp.getNumReductions(); i < e; ++i) {
       omp::ReductionDeclareOp decl = declareReduction(rewriter, reduce, i);
+      ompReductionDecls.push_back(decl);
       if (!decl)
         return failure();
       reductionDeclSymbols.push_back(
@@ -398,11 +400,39 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
     // Replace the reduction operations contained in this loop. Must be done
     // here rather than in a separate pattern to have access to the list of
     // reduction variables.
+    unsigned int reductionIndex = 0;
     for (auto [x, y] :
          llvm::zip_equal(reductionVariables, reduce.getOperands())) {
       OpBuilder::InsertionGuard guard(rewriter);
       rewriter.setInsertionPoint(reduce);
-      rewriter.create<omp::ReductionOp>(reduce.getLoc(), y, x);
+      Region &redRegion =
+          ompReductionDecls[reductionIndex].getReductionRegion();
+      assert(redRegion.hasOneBlock() &&
+             "expect reduction region to have one block");
+      Value pvtRedVar = parallelOp.getRegion().addArgument(x.getType(), loc);
+      Value pvtRedVal = rewriter.create<LLVM::LoadOp>(
+          reduce.getLoc(), ompReductionDecls[reductionIndex].getType(),
+          pvtRedVar);
+      // Make a copy of the reduction combiner region in the body
+      mlir::OpBuilder builder(rewriter.getContext());
+      builder.setInsertionPoint(reduce);
+      mlir::IRMapping mapper;
+      assert(redRegion.getNumArguments() == 2 &&
+             "expect reduction region to have two arguments");
+      mapper.map(redRegion.getArgument(0), pvtRedVal);
+      mapper.map(redRegion.getArgument(1), y);
+      for (auto &op : redRegion.getOps()) {
+        Operation *cloneOp = builder.clone(op, mapper);
+        if (auto yieldOp = dyn_cast<omp::YieldOp>(*cloneOp)) {
+          assert(yieldOp && yieldOp.getResults().size() == 1 &&
+                 "expect YieldOp in reduction region to return one result");
+          Value redVal = yieldOp.getResults()[0];
+          rewriter.create<LLVM::StoreOp>(loc, redVal, pvtRedVar);
+          rewriter.eraseOp(yieldOp);
+          break;
+        }
+      }
+      reductionIndex++;
     }
     rewriter.eraseOp(reduce);
 
diff --git a/mlir/test/Conversion/SCFToOpenMP/reductions.mlir b/mlir/test/Conversion/SCFToOpenMP/reductions.mlir
@@ -27,13 +27,15 @@ func.func @reduction1(%arg0 : index, %arg1 : index, %arg2 : index,
   %zero = arith.constant 0.0 : f32
   // CHECK: omp.parallel
   // CHECK: omp.wsloop
-  // CHECK-SAME: reduction(@[[$REDF]] -> %[[BUF]]
+  // CHECK-SAME: reduction(@[[$REDF]] %[[BUF]] -> %[[PVT_BUF:[a-z0-9]+]]
   // CHECK: memref.alloca_scope
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                             step (%arg4, %step) init (%zero) -> (f32) {
     // CHECK: %[[CST_INNER:.*]] = arith.constant 1.0
     %one = arith.constant 1.0 : f32
-    // CHECK: omp.reduction %[[CST_INNER]], %[[BUF]]
+    // CHECK: %[[PVT_VAL:.*]] = llvm.load %[[PVT_BUF]] : !llvm.ptr -> f32
+    // CHECK: %[[ADD_RESULT:.*]] = arith.addf %[[PVT_VAL]], %[[CST_INNER]] : f32
+    // CHECK: llvm.store %[[ADD_RESULT]], %[[PVT_BUF]] : f32, !llvm.ptr
     scf.reduce(%one : f32) {
     ^bb0(%lhs : f32, %rhs: f32):
       %res = arith.addf %lhs, %rhs : f32
@@ -103,10 +105,15 @@ func.func @reduction_muli(%arg0 : index, %arg1 : index, %arg2 : index,
                  %arg3 : index, %arg4 : index) {
   %step = arith.constant 1 : index
   %one = arith.constant 1 : i32
+  // CHECK: %[[RED_VAR:.*]] = llvm.alloca %{{.*}} x i32 : (i64) -> !llvm.ptr
+  // CHECK: omp.wsloop reduction(@[[$REDI]] %[[RED_VAR]] -> %[[RED_PVT_VAR:.*]] : !llvm.ptr)
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                             step (%arg4, %step) init (%one) -> (i32) {
-    // CHECK: omp.reduction
+    // CHECK: %[[C2:.*]] = arith.constant 2 : i32
     %pow2 = arith.constant 2 : i32
+    // CHECK: %[[RED_PVT_VAL:.*]] = llvm.load %[[RED_PVT_VAR]] : !llvm.ptr -> i32
+    // CHECK: %[[MUL_RESULT:.*]] = arith.muli %[[RED_PVT_VAL]], %[[C2]] : i32
+    // CHECK: llvm.store %[[MUL_RESULT]], %[[RED_PVT_VAR]] : i32, !llvm.ptr
     scf.reduce(%pow2 : i32) {
     ^bb0(%lhs : i32, %rhs: i32):
       %res = arith.muli %lhs, %rhs : i32
@@ -199,16 +206,23 @@ func.func @reduction4(%arg0 : index, %arg1 : index, %arg2 : index,
 
   // CHECK: omp.parallel
   // CHECK: omp.wsloop
-  // CHECK-SAME: reduction(@[[$REDF1]] -> %[[BUF1]]
-  // CHECK-SAME:           @[[$REDF2]] -> %[[BUF2]]
+  // CHECK-SAME: reduction(@[[$REDF1]] %[[BUF1]] -> %[[PVT_BUF1:[a-z0-9]+]]
+  // CHECK-SAME:           @[[$REDF2]] %[[BUF2]] -> %[[PVT_BUF2:[a-z0-9]+]]
   // CHECK: memref.alloca_scope
   %res:2 = scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                         step (%arg4, %step) init (%zero, %ione) -> (f32, i64) {
+    // CHECK: %[[CST_ONE:.*]] = arith.constant 1.0{{.*}} : f32
     %one = arith.constant 1.0 : f32
-    // CHECK: arith.fptosi
+    // CHECK: %[[CST_INT_ONE:.*]] = arith.fptosi
     %1 = arith.fptosi %one : f32 to i64
-    // CHECK: omp.reduction %{{.*}}, %[[BUF1]]
-    // CHECK: omp.reduction %{{.*}}, %[[BUF2]]
+    // CHECK: %[[PVT_VAL1:.*]] = llvm.load %[[PVT_BUF1]] : !llvm.ptr -> f32
+    // CHECK: %[[TEMP1:.*]] = arith.cmpf oge, %[[PVT_VAL1]], %[[CST_ONE]] : f32
+    // CHECK: %[[CMP_VAL1:.*]] = arith.select %[[TEMP1]], %[[PVT_VAL1]], %[[CST_ONE]] : f32
+    // CHECK: llvm.store %[[CMP_VAL1]], %[[PVT_BUF1]] : f32, !llvm.ptr
+    // CHECK: %[[PVT_VAL2:.*]] = llvm.load %[[PVT_BUF2]] : !llvm.ptr -> i64
+    // CHECK: %[[TEMP2:.*]] = arith.cmpi slt, %[[PVT_VAL2]], %[[CST_INT_ONE]] : i64
+    // CHECK: %[[CMP_VAL2:.*]] = arith.select %[[TEMP2]], %[[CST_INT_ONE]], %[[PVT_VAL2]] : i64
+    // CHECK: llvm.store %[[CMP_VAL2]], %[[PVT_BUF2]] : i64, !llvm.ptr
     scf.reduce(%one, %1 : f32, i64) {
     ^bb0(%lhs : f32, %rhs: f32):
       %cmp = arith.cmpf oge, %lhs, %rhs : f32