Skip to content

[flang] Add reductions for CUF Kernels: Lowering #95184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 12, 2024

Conversation

ImanHosseini
Copy link
Contributor

  • Add reductionOperands and reductionAttrs to cuf's KernelOp.
  • Parsing is already working and the tree has the info: here I make the Bridge emit the updated KernelOp with reduction information added.
  • Check |reductionAttrs| = |reductionOperands| in verifier
  • Add a test
    @clementval @vzakhari

@ImanHosseini ImanHosseini requested a review from clementval June 12, 2024 00:09
@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Jun 12, 2024
@llvmbot
Copy link
Member

llvmbot commented Jun 12, 2024

@llvm/pr-subscribers-flang-fir-hlfir

Author: Iman Hosseini (ImanHosseini)

Changes
  • Add reductionOperands and reductionAttrs to cuf's KernelOp.
  • Parsing is already working and the tree has the info: here I make the Bridge emit the updated KernelOp with reduction information added.
  • Check |reductionAttrs| = |reductionOperands| in verifier
  • Add a test
    @clementval @vzakhari

Full diff: https://github.com/llvm/llvm-project/pull/95184.diff

4 Files Affected:

  • (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+22-1)
  • (modified) flang/lib/Lower/Bridge.cpp (+32-2)
  • (modified) flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp (+3-1)
  • (added) flang/test/Lower/cuf_kernel_do_reduction.f90 (+37)
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 37b8da0181955..5c27b2e7f2938 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -17,6 +17,7 @@
 include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
 include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
+include "flang/Optimizer/Dialect/FIRAttr.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/IR/BuiltinAttributes.td"
 
@@ -249,7 +250,9 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
     Variadic<Index>:$lowerbound,
     Variadic<Index>:$upperbound,
     Variadic<Index>:$step,
-    OptionalAttr<I64Attr>:$n
+    OptionalAttr<I64Attr>:$n,
+    Variadic<AnyType>:$reduceOperands,
+    OptionalAttr<ArrayAttr>:$reduceAttrs
   );
 
   let regions = (region AnyRegion:$region);
@@ -260,9 +263,27 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
         ( `,` `stream` `=` $stream^ )? `>` `>` `>`
         custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
             $upperbound, type($upperbound), $step, type($step))
+        `reduce_oprnds` $reduceOperands `:` type($reduceOperands)
         attr-dict
   }];
 
+  let extraClassDeclaration = [{
+    /// Get Number of variadic operands
+    unsigned getNumOperands(unsigned idx) {
+      auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>(
+        getOperandSegmentSizeAttr());
+      return static_cast<unsigned>(segments[idx]);
+    }
+    // Get Number of reduction operands
+    unsigned getNumReduceOperands() {
+      return getNumOperands(7);
+    }
+    /// Does the operation hold operands for reduction variables
+    bool hasReduceOperands() {
+      return getNumReduceOperands() > 0;
+    }
+  }];
+
   let hasVerifier = 1;
 }
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 202efa57d4a36..27c80bf3788b3 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2669,6 +2669,35 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         std::get<2>(dir.t);
     const std::optional<Fortran::parser::ScalarIntExpr> &stream =
         std::get<3>(dir.t);
+    const std::list<Fortran::parser::CUFReduction> &cufreds =
+        std::get<4>(dir.t);
+
+    llvm::SmallVector<mlir::Value> reduceOperands;
+    llvm::SmallVector<mlir::Attribute> reduceAttrs;
+
+    for (const Fortran::parser::CUFReduction &cufred : cufreds) {
+      fir::ReduceOperationEnum redOpEnum = getReduceOperationEnum(
+          std::get<Fortran::parser::ReductionOperator>(cufred.t));
+      const std::list<Fortran::parser::Scalar<Fortran::parser::Variable>>
+          &scalarvars = std::get<1>(cufred.t);
+      for (const Fortran::parser::Scalar<Fortran::parser::Variable> &scalarvar :
+           scalarvars) {
+        auto reduce_attr =
+            fir::ReduceAttr::get(builder->getContext(), redOpEnum);
+        reduceAttrs.push_back(reduce_attr);
+        const Fortran::parser::Variable &var = scalarvar.thing;
+        if (const auto *iDesignator = std::get_if<
+                Fortran::common::Indirection<Fortran::parser::Designator>>(
+                &var.u)) {
+          const Fortran::parser::Designator &designator = iDesignator->value();
+          if (const auto *name =
+                  Fortran::semantics::getDesignatorNameIfDataRef(designator)) {
+            auto val = getSymbolAddress(*name->symbol);
+            reduceOperands.push_back(val);
+          }
+        }
+      }
+    }
 
     auto isOnlyStars =
         [&](const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr>
@@ -2771,8 +2800,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         loopEval = &*std::next(loopEval->getNestedEvaluations().begin());
     }
 
-    auto op = builder->create<cuf::KernelOp>(loc, gridValues, blockValues,
-                                             streamValue, lbs, ubs, steps, n);
+    auto op = builder->create<cuf::KernelOp>(
+        loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n,
+        mlir::ValueRange(reduceOperands), builder->getArrayAttr(reduceAttrs));
     builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
                          ivLocs);
     mlir::Block &b = op.getRegion().back();
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 2c0c4c2cfae34..a807e21def27a 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -227,7 +227,9 @@ mlir::LogicalResult cuf::KernelOp::verify() {
       getLowerbound().size() != getStep().size())
     return emitOpError(
         "expect same number of values in lowerbound, upperbound and step");
-
+  if (getReduceOperands().size() != getReduceAttrs()->size())
+    return emitOpError("expect same number of values in reduce operands and "
+                       "reduce attributes");
   return mlir::success();
 }
 
diff --git a/flang/test/Lower/cuf_kernel_do_reduction.f90 b/flang/test/Lower/cuf_kernel_do_reduction.f90
new file mode 100644
index 0000000000000..7088c1df6cae3
--- /dev/null
+++ b/flang/test/Lower/cuf_kernel_do_reduction.f90
@@ -0,0 +1,37 @@
+! Test CUDA Fortran kernel do reduction
+! RUN: bbc -emit-fir -fcuda -o - %s | FileCheck %s
+
+module mod1
+contains
+   subroutine host_sub()
+      integer, parameter :: asize = 4
+      integer, device :: adev(asize) 
+      integer :: ahost(asize)
+      integer :: q
+      integer, device :: add_reduce_var
+      integer, device :: mul_reduce_var
+      ! CHECK: %[[VAL_0:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadd_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      ! CHECK: %[[VAL_1:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEmul_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      do i = 1, asize
+         ahost(i) = i
+      enddo
+      adev = ahost 
+      add_reduce_var = 0.0
+      mul_reduce_var = 1.0
+      ! CHECK:     } reduce_oprnds %[[VAL_0:.*]], %[[VAL_1:.*]] : !fir.ref<i32>, !fir.ref<i32> {reduceAttrs = [#fir.reduce_attr<add>, #fir.reduce_attr<multiply>]}
+      !$cuf kernel do <<< *, * >>> reduce(+:add_reduce_var) reduce(*:mul_reduce_var)
+      do i = 1, asize
+         add_reduce_var = add_reduce_var + adev(i)
+         mul_reduce_var = mul_reduce_var * adev(i)
+      end do
+      q = rsum
+      ahost = adev 
+      print *, q
+   end
+end
+
+program test
+   use mod1
+   implicit none
+   call host_sub()
+end program test
\ No newline at end of file

Co-authored-by: Valentin Clement (バレンタイン クレメン) <[email protected]>
Copy link

github-actions bot commented Jun 12, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

Iman Hosseini added 2 commits June 11, 2024 18:30
Move test to /CUDA/ directory like other CUDA tests.
Copy link
Contributor

@clementval clementval left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@ImanHosseini ImanHosseini merged commit 7665d3d into llvm:main Jun 12, 2024
7 checks passed
@ImanHosseini ImanHosseini deleted the flang_cuf_reductions branch June 12, 2024 18:18
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
flang:fir-hlfir flang Flang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants