Skip to content

[flang] Add reductions for CUF Kernels: Lowering #95184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
include "flang/Optimizer/Dialect/FIRTypes.td"
include "flang/Optimizer/Dialect/FIRAttr.td"
include "mlir/Interfaces/LoopLikeInterface.td"
include "mlir/IR/BuiltinAttributes.td"

Expand Down Expand Up @@ -249,7 +250,9 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
Variadic<Index>:$lowerbound,
Variadic<Index>:$upperbound,
Variadic<Index>:$step,
OptionalAttr<I64Attr>:$n
OptionalAttr<I64Attr>:$n,
Variadic<AnyType>:$reduceOperands,
OptionalAttr<ArrayAttr>:$reduceAttrs
);

let regions = (region AnyRegion:$region);
Expand All @@ -258,11 +261,29 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
`<` `<` `<` custom<CUFKernelValues>($grid, type($grid)) `,`
custom<CUFKernelValues>($block, type($block))
( `,` `stream` `=` $stream^ )? `>` `>` `>`
( `reduce` `(` $reduceOperands^ `:` type($reduceOperands) `:` $reduceAttrs `)` )?
custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
$upperbound, type($upperbound), $step, type($step))
attr-dict
}];

let extraClassDeclaration = [{
/// Get Number of variadic operands
unsigned getNumOperands(unsigned idx) {
auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>(
getOperandSegmentSizeAttr());
return static_cast<unsigned>(segments[idx]);
}
// Get Number of reduction operands
unsigned getNumReduceOperands() {
return getNumOperands(7);
}
/// Does the operation hold operands for reduction variables
bool hasReduceOperands() {
return getNumReduceOperands() > 0;
}
}];

let hasVerifier = 1;
}

Expand Down
34 changes: 32 additions & 2 deletions flang/lib/Lower/Bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2669,6 +2669,35 @@ class FirConverter : public Fortran::lower::AbstractConverter {
std::get<2>(dir.t);
const std::optional<Fortran::parser::ScalarIntExpr> &stream =
std::get<3>(dir.t);
const std::list<Fortran::parser::CUFReduction> &cufreds =
std::get<4>(dir.t);

llvm::SmallVector<mlir::Value> reduceOperands;
llvm::SmallVector<mlir::Attribute> reduceAttrs;

for (const Fortran::parser::CUFReduction &cufred : cufreds) {
fir::ReduceOperationEnum redOpEnum = getReduceOperationEnum(
std::get<Fortran::parser::ReductionOperator>(cufred.t));
const std::list<Fortran::parser::Scalar<Fortran::parser::Variable>>
&scalarvars = std::get<1>(cufred.t);
for (const Fortran::parser::Scalar<Fortran::parser::Variable> &scalarvar :
scalarvars) {
auto reduce_attr =
fir::ReduceAttr::get(builder->getContext(), redOpEnum);
reduceAttrs.push_back(reduce_attr);
const Fortran::parser::Variable &var = scalarvar.thing;
if (const auto *iDesignator = std::get_if<
Fortran::common::Indirection<Fortran::parser::Designator>>(
&var.u)) {
const Fortran::parser::Designator &designator = iDesignator->value();
if (const auto *name =
Fortran::semantics::getDesignatorNameIfDataRef(designator)) {
auto val = getSymbolAddress(*name->symbol);
reduceOperands.push_back(val);
}
}
}
}

auto isOnlyStars =
[&](const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr>
Expand Down Expand Up @@ -2771,8 +2800,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
loopEval = &*std::next(loopEval->getNestedEvaluations().begin());
}

auto op = builder->create<cuf::KernelOp>(loc, gridValues, blockValues,
streamValue, lbs, ubs, steps, n);
auto op = builder->create<cuf::KernelOp>(
loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n,
mlir::ValueRange(reduceOperands), builder->getArrayAttr(reduceAttrs));
builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
ivLocs);
mlir::Block &b = op.getRegion().back();
Expand Down
13 changes: 12 additions & 1 deletion flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
Expand Down Expand Up @@ -227,7 +228,17 @@ mlir::LogicalResult cuf::KernelOp::verify() {
getLowerbound().size() != getStep().size())
return emitOpError(
"expect same number of values in lowerbound, upperbound and step");

auto reduceAttrs = getReduceAttrs();
std::size_t reduceAttrsSize = reduceAttrs ? reduceAttrs->size() : 0;
if (getReduceOperands().size() != reduceAttrsSize)
return emitOpError("expect same number of values in reduce operands and "
"reduce attributes");
if (reduceAttrs) {
for (const auto &attr : reduceAttrs.value()) {
if (!mlir::isa<fir::ReduceAttr>(attr))
return emitOpError("expect reduce attributes to be ReduceAttr");
}
}
return mlir::success();
}

Expand Down
37 changes: 37 additions & 0 deletions flang/test/Lower/CUDA/cuda-kernel-do-reduction.cuf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
! Test CUDA Fortran kernel do reduction
! RUN: bbc -emit-fir -fcuda -o - %s | FileCheck %s

module mod1
contains
subroutine host_sub()
integer, parameter :: asize = 4
integer, device :: adev(asize)
integer :: ahost(asize)
integer :: q
integer, device :: add_reduce_var
integer, device :: mul_reduce_var
! CHECK: %[[VAL_0:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadd_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
! CHECK: %[[VAL_1:.*]] = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEmul_reduce_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
do i = 1, asize
ahost(i) = i
enddo
adev = ahost
add_reduce_var = 0.0
mul_reduce_var = 1.0
! CHECK: {{.*}} reduce(%[[VAL_0:.*]], %[[VAL_1:.*]] : !fir.ref<i32>, !fir.ref<i32> : [#fir.reduce_attr<add>, #fir.reduce_attr<multiply>]) {{.*}}
!$cuf kernel do <<< *, * >>> reduce(+:add_reduce_var) reduce(*:mul_reduce_var)
do i = 1, asize
add_reduce_var = add_reduce_var + adev(i)
mul_reduce_var = mul_reduce_var * adev(i)
end do
q = rsum
ahost = adev
print *, q
end
end

program test
use mod1
implicit none
call host_sub()
end program test
Loading