Skip to content

[flang][fir] Lower do concurrent loop nests to fir.do_concurrent #132904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 136 additions & 92 deletions flang/lib/Lower/Bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,11 @@ struct IncrementLoopInfo {
template <typename T>
explicit IncrementLoopInfo(Fortran::semantics::Symbol &sym, const T &lower,
const T &upper, const std::optional<T> &step,
bool isUnordered = false)
bool isConcurrent = false)
: loopVariableSym{&sym}, lowerExpr{Fortran::semantics::GetExpr(lower)},
upperExpr{Fortran::semantics::GetExpr(upper)},
stepExpr{Fortran::semantics::GetExpr(step)}, isUnordered{isUnordered} {}
stepExpr{Fortran::semantics::GetExpr(step)},
isConcurrent{isConcurrent} {}

IncrementLoopInfo(IncrementLoopInfo &&) = default;
IncrementLoopInfo &operator=(IncrementLoopInfo &&x) = default;
Expand All @@ -120,7 +121,7 @@ struct IncrementLoopInfo {
const Fortran::lower::SomeExpr *upperExpr;
const Fortran::lower::SomeExpr *stepExpr;
const Fortran::lower::SomeExpr *maskExpr = nullptr;
bool isUnordered; // do concurrent, forall
bool isConcurrent;
llvm::SmallVector<const Fortran::semantics::Symbol *> localSymList;
llvm::SmallVector<const Fortran::semantics::Symbol *> localInitSymList;
llvm::SmallVector<
Expand All @@ -130,7 +131,7 @@ struct IncrementLoopInfo {
mlir::Value loopVariable = nullptr;

// Data members for structured loops.
fir::DoLoopOp doLoop = nullptr;
mlir::Operation *loopOp = nullptr;

// Data members for unstructured loops.
bool hasRealControl = false;
Expand Down Expand Up @@ -1980,7 +1981,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
llvm_unreachable("illegal reduction operator");
}

/// Collect DO CONCURRENT or FORALL loop control information.
/// Collect DO CONCURRENT loop control information.
IncrementLoopNestInfo getConcurrentControl(
const Fortran::parser::ConcurrentHeader &header,
const std::list<Fortran::parser::LocalitySpec> &localityList = {}) {
Expand Down Expand Up @@ -2291,8 +2292,14 @@ class FirConverter : public Fortran::lower::AbstractConverter {
mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get(
builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua,
/*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
if (has_attrs)
info.doLoop.setLoopAnnotationAttr(la);
if (has_attrs) {
if (auto loopOp = mlir::dyn_cast<fir::DoLoopOp>(info.loopOp))
loopOp.setLoopAnnotationAttr(la);

if (auto doConcurrentOp =
mlir::dyn_cast<fir::DoConcurrentLoopOp>(info.loopOp))
doConcurrentOp.setLoopAnnotationAttr(la);
}
}

/// Generate FIR to begin a structured or unstructured increment loop nest.
Expand All @@ -2301,96 +2308,77 @@ class FirConverter : public Fortran::lower::AbstractConverter {
llvm::SmallVectorImpl<const Fortran::parser::CompilerDirective *> &dirs) {
assert(!incrementLoopNestInfo.empty() && "empty loop nest");
mlir::Location loc = toLocation();
mlir::Operation *boundsAndStepIP = nullptr;
mlir::arith::IntegerOverflowFlags iofBackup{};

llvm::SmallVector<mlir::Value> nestLBs;
llvm::SmallVector<mlir::Value> nestUBs;
llvm::SmallVector<mlir::Value> nestSts;
llvm::SmallVector<mlir::Value> nestReduceOperands;
llvm::SmallVector<mlir::Attribute> nestReduceAttrs;
bool genDoConcurrent = false;

for (IncrementLoopInfo &info : incrementLoopNestInfo) {
mlir::Value lowerValue;
mlir::Value upperValue;
mlir::Value stepValue;
genDoConcurrent = info.isStructured() && info.isConcurrent;

{
mlir::OpBuilder::InsertionGuard guard(*builder);
if (!genDoConcurrent)
info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
info.isConcurrent);

// Set the IP before the first loop in the nest so that all nest bounds
// and step values are created outside the nest.
if (boundsAndStepIP)
builder->setInsertionPointAfter(boundsAndStepIP);
if (!getLoweringOptions().getIntegerWrapAround()) {
iofBackup = builder->getIntegerOverflowFlags();
builder->setIntegerOverflowFlags(
mlir::arith::IntegerOverflowFlags::nsw);
}

info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
info.isUnordered);
if (!getLoweringOptions().getIntegerWrapAround()) {
iofBackup = builder->getIntegerOverflowFlags();
builder->setIntegerOverflowFlags(
mlir::arith::IntegerOverflowFlags::nsw);
}
lowerValue = genControlValue(info.lowerExpr, info);
upperValue = genControlValue(info.upperExpr, info);
bool isConst = true;
stepValue = genControlValue(info.stepExpr, info,
info.isStructured() ? nullptr : &isConst);
if (!getLoweringOptions().getIntegerWrapAround())
builder->setIntegerOverflowFlags(iofBackup);
boundsAndStepIP = stepValue.getDefiningOp();

// Use a temp variable for unstructured loops with non-const step.
if (!isConst) {
info.stepVariable =
builder->createTemporary(loc, stepValue.getType());
boundsAndStepIP =
builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
nestLBs.push_back(genControlValue(info.lowerExpr, info));
nestUBs.push_back(genControlValue(info.upperExpr, info));
bool isConst = true;
nestSts.push_back(genControlValue(
info.stepExpr, info, info.isStructured() ? nullptr : &isConst));

if (!getLoweringOptions().getIntegerWrapAround())
builder->setIntegerOverflowFlags(iofBackup);

// Use a temp variable for unstructured loops with non-const step.
if (!isConst) {
mlir::Value stepValue = nestSts.back();
info.stepVariable = builder->createTemporary(loc, stepValue.getType());
builder->create<fir::StoreOp>(loc, stepValue, info.stepVariable);
}

if (genDoConcurrent && nestReduceOperands.empty()) {
// Create DO CONCURRENT reduce operands and attributes
for (const auto &reduceSym : info.reduceSymList) {
const fir::ReduceOperationEnum reduceOperation = reduceSym.first;
const Fortran::semantics::Symbol *sym = reduceSym.second;
fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
nestReduceOperands.push_back(fir::getBase(exv));
auto reduceAttr =
fir::ReduceAttr::get(builder->getContext(), reduceOperation);
nestReduceAttrs.push_back(reduceAttr);
}
}
}

for (auto [info, lowerValue, upperValue, stepValue] :
llvm::zip_equal(incrementLoopNestInfo, nestLBs, nestUBs, nestSts)) {
// Structured loop - generate fir.do_loop.
if (info.isStructured()) {
if (genDoConcurrent)
continue;

// The loop variable is a doLoop op argument.
mlir::Type loopVarType = info.getLoopVariableType();
mlir::Value loopValue;
if (info.isUnordered) {
llvm::SmallVector<mlir::Value> reduceOperands;
llvm::SmallVector<mlir::Attribute> reduceAttrs;
// Create DO CONCURRENT reduce operands and attributes
for (const auto &reduceSym : info.reduceSymList) {
const fir::ReduceOperationEnum reduce_operation = reduceSym.first;
const Fortran::semantics::Symbol *sym = reduceSym.second;
fir::ExtendedValue exv = getSymbolExtendedValue(*sym, nullptr);
reduceOperands.push_back(fir::getBase(exv));
auto reduce_attr =
fir::ReduceAttr::get(builder->getContext(), reduce_operation);
reduceAttrs.push_back(reduce_attr);
}
// The loop variable value is explicitly updated.
info.doLoop = builder->create<fir::DoLoopOp>(
loc, lowerValue, upperValue, stepValue, /*unordered=*/true,
/*finalCountValue=*/false, /*iterArgs=*/std::nullopt,
llvm::ArrayRef<mlir::Value>(reduceOperands), reduceAttrs);
builder->setInsertionPointToStart(info.doLoop.getBody());
loopValue = builder->createConvert(loc, loopVarType,
info.doLoop.getInductionVar());
} else {
// The loop variable is a doLoop op argument.
info.doLoop = builder->create<fir::DoLoopOp>(
loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
/*finalCountValue=*/true,
builder->createConvert(loc, loopVarType, lowerValue));
builder->setInsertionPointToStart(info.doLoop.getBody());
loopValue = info.doLoop.getRegionIterArgs()[0];
}
auto loopOp = builder->create<fir::DoLoopOp>(
loc, lowerValue, upperValue, stepValue, /*unordered=*/false,
/*finalCountValue=*/true,
builder->createConvert(loc, loopVarType, lowerValue));
info.loopOp = loopOp;
builder->setInsertionPointToStart(loopOp.getBody());
mlir::Value loopValue = loopOp.getRegionIterArgs()[0];

// Update the loop variable value in case it has non-index references.
builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);
if (info.maskExpr) {
Fortran::lower::StatementContext stmtCtx;
mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
stmtCtx.finalizeAndReset();
mlir::Value maskCondCast =
builder->createConvert(loc, builder->getI1Type(), maskCond);
auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
/*withElseRegion=*/false);
builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
}
if (info.hasLocalitySpecs())
handleLocalitySpecs(info);

addLoopAnnotationAttr(info, dirs);
continue;
}
Expand Down Expand Up @@ -2454,6 +2442,60 @@ class FirConverter : public Fortran::lower::AbstractConverter {
builder->restoreInsertionPoint(insertPt);
}
}

if (genDoConcurrent) {
auto loopWrapperOp = builder->create<fir::DoConcurrentOp>(loc);
builder->setInsertionPointToStart(
builder->createBlock(&loopWrapperOp.getRegion()));

for (IncrementLoopInfo &info : llvm::reverse(incrementLoopNestInfo)) {
info.loopVariable = genLoopVariableAddress(loc, *info.loopVariableSym,
info.isConcurrent);
}

builder->setInsertionPointToEnd(loopWrapperOp.getBody());
auto loopOp = builder->create<fir::DoConcurrentLoopOp>(
loc, nestLBs, nestUBs, nestSts, nestReduceOperands,
nestReduceAttrs.empty()
? nullptr
: mlir::ArrayAttr::get(builder->getContext(), nestReduceAttrs),
nullptr);

llvm::SmallVector<mlir::Type> loopBlockArgTypes(
incrementLoopNestInfo.size(), builder->getIndexType());
llvm::SmallVector<mlir::Location> loopBlockArgLocs(
incrementLoopNestInfo.size(), loc);
mlir::Region &loopRegion = loopOp.getRegion();
mlir::Block *loopBlock = builder->createBlock(
&loopRegion, loopRegion.begin(), loopBlockArgTypes, loopBlockArgLocs);
builder->setInsertionPointToStart(loopBlock);

for (auto [info, blockArg] :
llvm::zip_equal(incrementLoopNestInfo, loopBlock->getArguments())) {
info.loopOp = loopOp;
mlir::Value loopValue =
builder->createConvert(loc, info.getLoopVariableType(), blockArg);
builder->create<fir::StoreOp>(loc, loopValue, info.loopVariable);

if (info.maskExpr) {
Fortran::lower::StatementContext stmtCtx;
mlir::Value maskCond = createFIRExpr(loc, info.maskExpr, stmtCtx);
stmtCtx.finalizeAndReset();
mlir::Value maskCondCast =
builder->createConvert(loc, builder->getI1Type(), maskCond);
auto ifOp = builder->create<fir::IfOp>(loc, maskCondCast,
/*withElseRegion=*/false);
builder->setInsertionPointToStart(&ifOp.getThenRegion().front());
}
}

IncrementLoopInfo &innermostInfo = incrementLoopNestInfo.back();

if (innermostInfo.hasLocalitySpecs())
handleLocalitySpecs(innermostInfo);

addLoopAnnotationAttr(innermostInfo, dirs);
}
}

/// Generate FIR to end a structured or unstructured increment loop nest.
Expand All @@ -2470,29 +2512,31 @@ class FirConverter : public Fortran::lower::AbstractConverter {
it != rend; ++it) {
IncrementLoopInfo &info = *it;
if (info.isStructured()) {
// End fir.do_loop.
if (info.isUnordered) {
builder->setInsertionPointAfter(info.doLoop);
// End fir.do_concurent.loop.
if (info.isConcurrent) {
builder->setInsertionPointAfter(info.loopOp->getParentOp());
continue;
}

// End fir.do_loop.
// Decrement tripVariable.
builder->setInsertionPointToEnd(info.doLoop.getBody());
auto doLoopOp = mlir::cast<fir::DoLoopOp>(info.loopOp);
builder->setInsertionPointToEnd(doLoopOp.getBody());
llvm::SmallVector<mlir::Value, 2> results;
results.push_back(builder->create<mlir::arith::AddIOp>(
loc, info.doLoop.getInductionVar(), info.doLoop.getStep(),
iofAttr));
loc, doLoopOp.getInductionVar(), doLoopOp.getStep(), iofAttr));
// Step loopVariable to help optimizations such as vectorization.
// Induction variable elimination will clean up as necessary.
mlir::Value step = builder->createConvert(
loc, info.getLoopVariableType(), info.doLoop.getStep());
loc, info.getLoopVariableType(), doLoopOp.getStep());
mlir::Value loopVar =
builder->create<fir::LoadOp>(loc, info.loopVariable);
results.push_back(
builder->create<mlir::arith::AddIOp>(loc, loopVar, step, iofAttr));
builder->create<fir::ResultOp>(loc, results);
builder->setInsertionPointAfter(info.doLoop);
builder->setInsertionPointAfter(doLoopOp);
// The loop control variable may be used after the loop.
builder->create<fir::StoreOp>(loc, info.doLoop.getResult(1),
builder->create<fir::StoreOp>(loc, doLoopOp.getResult(1),
info.loopVariable);
continue;
}
Expand Down
3 changes: 3 additions & 0 deletions flang/lib/Optimizer/Builder/FIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,9 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
if (auto cufKernelOp = getRegion().getParentOfType<cuf::KernelOp>())
return &cufKernelOp.getRegion().front();

if (auto doConcurentOp = getRegion().getParentOfType<fir::DoConcurrentOp>())
return doConcurentOp.getBody();
Comment on lines +283 to +284
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this for any allocation in the do concurrent? We did not have this so far so I'm just wondering.

Copy link
Member Author

@ergawy ergawy Mar 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do it this way for the new fir.do_concurrent and fir.do_concurrent.loop ops to keep all info related to the loop nest closely located in the IR. The allocations needed here are for the nest iteration variables. See: https://discourse.llvm.org/t/modeling-do-concurrent-loops-in-the-fir-dialect/84950/6?u=bob.belcher and related discussion and https://github.com/llvm/llvm-project/blob/main/flang/include/flang/Optimizer/Dialect/FIROps.td#L3457.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok for the iteration variables but what about any temporary that will be allocated during the body lowering. Is it ok to add them at the same place as the iteration variables?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These will be emitted in the body of the loop itself (for every iteration), just like what would happen now when targeting the old fir.do_loop ... unorderd nest. I think it would be ok to move them to the alloca block only if we serialize to fir.do_loop ... unordered nest (as we do in DoConcurrentConversion in SimplifyFIROperations.cpp) or if we are targeting OpenMP, for example, we can share them for each chunk of the parallel loop that we map to. But in the general case, I think it would not be ok to move them out at this point.

Let me know if I misunderstood what you meant.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think they are created in the loop body. The code calls getAllocaBlock() and currently, it will probably return the entry block of the function. With your change it will return the do concurrent body. So instead of having the alloca for temporary in the function entry block you will have them in the do concurrent. If this is fine for you then it's ok but this change from the current behavior.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, I got confused here. This is still fine since the temp allocations will be emitted in the fir.do_concurrent wrapper op not the fir.do_concurrent.loop op. When serialized (i.e. converted to fir.do_loop ... unordered nest), these allocations are moved to the alloca block (see https://github.com/llvm/llvm-project/blob/main/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp#L167-L176). For OpenMP, I call these "loop-local values", these values are privatized by the do concurrent to OpenMP pass (see #127635).


return getEntryBlock();
}

Expand Down
39 changes: 32 additions & 7 deletions flang/test/Lower/do_concurrent.f90
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ subroutine sub1(n)
implicit none
integer :: n, m, i, j, k
integer, dimension(n) :: a
!CHECK: %[[N_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFsub1En"}
!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Ea"}

!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
Expand All @@ -29,10 +32,30 @@ subroutine sub1(n)
!CHECK: %[[UB3:.*]] = arith.constant 10 : i32
!CHECK: %[[UB3_CVT:.*]] = fir.convert %[[UB3]] : (i32) -> index

!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
!CHECK: fir.do_loop %{{.*}} = %[[LB3_CVT]] to %[[UB3_CVT]] step %{{.*}} unordered
!CHECK: fir.do_concurrent
!CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i"}
!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]]
!CHECK: %[[J:.*]] = fir.alloca i32 {bindc_name = "j"}
!CHECK: %[[J_DECL:.*]]:2 = hlfir.declare %[[J]]
!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k"}
!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]]

!CHECK: fir.do_concurrent.loop (%[[I_IV:.*]], %[[J_IV:.*]], %[[K_IV:.*]]) =
!CHECK-SAME: (%[[LB1_CVT]], %[[LB2_CVT]], %[[LB3_CVT]]) to
!CHECK-SAME: (%[[UB1_CVT]], %[[UB2_CVT]], %[[UB3_CVT]]) step
!CHECK-SAME: (%{{.*}}, %{{.*}}, %{{.*}}) {
!CHECK: %[[I_IV_CVT:.*]] = fir.convert %[[I_IV]] : (index) -> i32
!CHECK: fir.store %[[I_IV_CVT]] to %[[I_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[J_IV_CVT:.*]] = fir.convert %[[J_IV]] : (index) -> i32
!CHECK: fir.store %[[J_IV_CVT]] to %[[J_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[K_IV_CVT:.*]] = fir.convert %[[K_IV]] : (index) -> i32
!CHECK: fir.store %[[K_IV_CVT]] to %[[K_DECL]]#0 : !fir.ref<i32>

!CHECK: %[[N_VAL:.*]] = fir.load %[[N_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[I_VAL:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[I_VAL_CVT:.*]] = fir.convert %[[I_VAL]] : (i32) -> i64
!CHECK: %[[A_ELEM:.*]] = hlfir.designate %[[A_DECL]]#0 (%[[I_VAL_CVT]])
!CHECK: hlfir.assign %[[N_VAL]] to %[[A_ELEM]] : i32, !fir.ref<i32>
do concurrent(i=1:n, j=1:bar(n*m, n/m), k=5:10)
a(i) = n
end do
Expand All @@ -45,22 +68,24 @@ subroutine sub2(n)
integer, dimension(n) :: a
!CHECK: %[[LB1:.*]] = arith.constant 1 : i32
!CHECK: %[[LB1_CVT:.*]] = fir.convert %[[LB1]] : (i32) -> index
!CHECK: %[[UB1:.*]] = fir.load %5#0 : !fir.ref<i32>
!CHECK: %[[UB1:.*]] = fir.load %{{.*}}#0 : !fir.ref<i32>
!CHECK: %[[UB1_CVT:.*]] = fir.convert %[[UB1]] : (i32) -> index
!CHECK: fir.do_loop %{{.*}} = %[[LB1_CVT]] to %[[UB1_CVT]] step %{{.*}} unordered
!CHECK: fir.do_concurrent
!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB1_CVT]]) to (%[[UB1_CVT]]) step (%{{.*}})

!CHECK: %[[LB2:.*]] = arith.constant 1 : i32
!CHECK: %[[LB2_CVT:.*]] = fir.convert %[[LB2]] : (i32) -> index
!CHECK: %[[UB2:.*]] = fir.call @_QPbar(%{{.*}}, %{{.*}}) proc_attrs<pure> fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> i32
!CHECK: %[[UB2_CVT:.*]] = fir.convert %[[UB2]] : (i32) -> index
!CHECK: fir.do_loop %{{.*}} = %[[LB2_CVT]] to %[[UB2_CVT]] step %{{.*}} unordered
!CHECK: fir.do_concurrent
!CHECK: fir.do_concurrent.loop (%{{.*}}) = (%[[LB2_CVT]]) to (%[[UB2_CVT]]) step (%{{.*}})
do concurrent(i=1:n)
do concurrent(j=1:bar(n*m, n/m))
a(i) = n
end do
end do
end subroutine


!CHECK-LABEL: unstructured
subroutine unstructured(inner_step)
integer(4) :: i, j, inner_step
Expand Down
Loading
Loading