Skip to content

Commit d2e3c77

Browse files
committed
[flang][OpenMP] Map simple do concurrent loops to OpenMP host constructs
Upstreams one more part of the ROCm `do concurrent` to OpenMP mapping pass. This PR adds support for converting simple loops to the equivalent OpenMP constructs on the host: `omp parallel do`. Towards that end, we have to collect more information about loop nests, for which we add new utils in the `looputils` namespace.
1 parent ca40210 commit d2e3c77

File tree

6 files changed

+405
-19
lines changed

6 files changed

+405
-19
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,53 @@ see the "Data environment" section below.
126126
See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
127127
of what is and is not detected as a perfect loop nest.
128128

129+
### Single-range loops
130+
131+
Given the following loop:
132+
```fortran
133+
do concurrent(i=1:n)
134+
a(i) = i * i
135+
end do
136+
```
137+
138+
#### Mapping to `host`
139+
140+
Mapping this loop to the `host` generates MLIR operations of the following
141+
structure:
142+
143+
```
144+
%4 = fir.address_of(@_QFEa) ...
145+
%6:2 = hlfir.declare %4 ...
146+
147+
omp.parallel {
148+
// Allocate private copy for `i`.
149+
// TODO Use delayed privatization.
150+
%19 = fir.alloca i32 {bindc_name = "i"}
151+
%20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...
152+
153+
omp.wsloop {
154+
omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
155+
%23 = fir.convert %arg0 : (index) -> i32
156+
// Use the privatized version of `i`.
157+
fir.store %23 to %20#1 : !fir.ref<i32>
158+
...
159+
160+
// Use "shared" SSA value of `a`.
161+
%42 = hlfir.designate %6#0
162+
hlfir.assign %35 to %42
163+
...
164+
omp.yield
165+
}
166+
omp.terminator
167+
}
168+
omp.terminator
169+
}
170+
```
171+
172+
#### Mapping to `device`
173+
174+
<!-- TODO -->
175+
129176
<!--
130177
More details about current status will be added along with relevant parts of the
131178
implementation in later upstreaming patches.

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

Lines changed: 201 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "flang/Optimizer/OpenMP/Utils.h"
1212
#include "mlir/Analysis/SliceAnalysis.h"
1313
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
14+
#include "mlir/IR/IRMapping.h"
1415
#include "mlir/Transforms/DialectConversion.h"
1516
#include "mlir/Transforms/RegionUtils.h"
1617

@@ -24,7 +25,82 @@ namespace flangomp {
2425

2526
namespace {
2627
namespace looputils {
27-
using LoopNest = llvm::SetVector<fir::DoLoopOp>;
28+
/// Stores info needed about the induction/iteration variable for each `do
29+
/// concurrent` in a loop nest.
30+
struct InductionVariableInfo {
31+
/// the operation allocating memory for iteration variable,
32+
mlir::Operation *iterVarMemDef;
33+
};
34+
35+
using LoopNestToIndVarMap =
36+
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
37+
38+
/// Given an operation `op`, this returns true if one of `op`'s operands is
39+
/// "ultimately" the loop's induction variable. This helps in cases where the
40+
/// induction variable's use is "hidden" behind a convert/cast.
41+
///
42+
/// For example, give the following loop:
43+
/// ```
44+
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
45+
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
46+
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
47+
/// ...
48+
/// }
49+
/// ```
50+
///
51+
/// If \p op is the `fir.store` operation, then this function will return true
52+
/// since the IV is the "ultimate" operand to the `fir.store` op through the
53+
/// `%ind_var_conv` -> `%ind_var` conversion sequence.
54+
///
55+
/// For why this is useful, see its use in `findLoopIndVarMemDecl`.
56+
bool isIndVarUltimateOperand(mlir::Operation *op, fir::DoLoopOp doLoop) {
57+
while (op != nullptr && op->getNumOperands() > 0) {
58+
auto ivIt = llvm::find_if(op->getOperands(), [&](mlir::Value operand) {
59+
return operand == doLoop.getInductionVar();
60+
});
61+
62+
if (ivIt != op->getOperands().end())
63+
return true;
64+
65+
op = op->getOperand(0).getDefiningOp();
66+
}
67+
68+
return false;
69+
}
70+
71+
/// For the \p doLoop parameter, find the operation that declares its iteration
72+
/// variable or allocates memory for it.
73+
///
74+
/// For example, give the following loop:
75+
/// ```
76+
/// ...
77+
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
78+
/// ...
79+
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
80+
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
81+
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
82+
/// ...
83+
/// }
84+
/// ```
85+
///
86+
/// This function returns the `hlfir.declare` op for `%i`.
87+
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
88+
mlir::Value result = nullptr;
89+
mlir::visitUsedValuesDefinedAbove(
90+
doLoop.getRegion(), [&](mlir::OpOperand *operand) {
91+
if (result)
92+
return;
93+
94+
if (isIndVarUltimateOperand(operand->getOwner(), doLoop)) {
95+
assert(result == nullptr &&
96+
"loop can have only one induction variable");
97+
result = operand->get();
98+
}
99+
});
100+
101+
assert(result != nullptr && result.getDefiningOp() != nullptr);
102+
return result.getDefiningOp();
103+
}
28104

29105
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
30106
/// there are no operations in \p outerloop's body other than:
@@ -116,11 +192,14 @@ bool isPerfectlyNested(fir::DoLoopOp outerLoop, fir::DoLoopOp innerLoop) {
116192
/// fails to recognize a certain nested loop as part of the nest it just returns
117193
/// the parent loops it discovered before.
118194
mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
119-
LoopNest &loopNest) {
195+
LoopNestToIndVarMap &loopNest) {
120196
assert(currentLoop.getUnordered());
121197

122198
while (true) {
123-
loopNest.insert(currentLoop);
199+
loopNest.insert(
200+
{currentLoop,
201+
InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
202+
124203
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
125204

126205
for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
@@ -152,26 +231,136 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
152231
public:
153232
using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
154233

155-
DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice)
156-
: OpConversionPattern(context), mapToDevice(mapToDevice) {}
234+
DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice,
235+
llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip)
236+
: OpConversionPattern(context), mapToDevice(mapToDevice),
237+
concurrentLoopsToSkip(concurrentLoopsToSkip) {}
157238

158239
mlir::LogicalResult
159240
matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
160241
mlir::ConversionPatternRewriter &rewriter) const override {
161-
looputils::LoopNest loopNest;
242+
looputils::LoopNestToIndVarMap loopNest;
162243
bool hasRemainingNestedLoops =
163244
failed(looputils::collectLoopNest(doLoop, loopNest));
164245
if (hasRemainingNestedLoops)
165246
mlir::emitWarning(doLoop.getLoc(),
166247
"Some `do concurent` loops are not perfectly-nested. "
167248
"These will be serialized.");
168249

169-
// TODO This will be filled in with the next PRs that upstreams the rest of
170-
// the ROCm implementaion.
250+
mlir::IRMapping mapper;
251+
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
252+
mlir::omp::LoopNestOperands loopNestClauseOps;
253+
genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
254+
loopNestClauseOps);
255+
256+
mlir::omp::LoopNestOp ompLoopNest =
257+
genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
258+
/*isComposite=*/mapToDevice);
259+
260+
rewriter.eraseOp(doLoop);
261+
262+
// Mark `unordered` loops that are not perfectly nested to be skipped from
263+
// the legality check of the `ConversionTarget` since we are not interested
264+
// in mapping them to OpenMP.
265+
ompLoopNest->walk([&](fir::DoLoopOp doLoop) {
266+
if (doLoop.getUnordered()) {
267+
concurrentLoopsToSkip.insert(doLoop);
268+
}
269+
});
270+
171271
return mlir::success();
172272
}
173273

274+
private:
275+
/// Creates an `omp::ParallelOp` at the rewriter's current insertion point,
/// gives it a body block ending in an `omp::TerminatorOp`, and then emits the
/// loop nest's iteration-variable allocations in front of that terminator.
///
/// On return, the rewriter's insertion point is still inside the new parallel
/// region, before its terminator.
///
/// \param loc      location used for the created operations.
/// \param rewriter rewriter used to build the operations.
/// \param loopNest loops (and their iteration-variable info) being converted.
/// \param mapper   value mapping updated while cloning the allocations.
/// \return the newly created parallel operation.
mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
                                    mlir::ConversionPatternRewriter &rewriter,
                                    looputils::LoopNestToIndVarMap &loopNest,
                                    mlir::IRMapping &mapper) const {
  auto parallel = rewriter.create<mlir::omp::ParallelOp>(loc);
  rewriter.createBlock(&parallel.getRegion());

  // Terminate the region first so that everything generated afterwards lands
  // before the terminator.
  auto regionTerminator = rewriter.create<mlir::omp::TerminatorOp>(loc);
  rewriter.setInsertionPoint(regionTerminator);

  genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
  return parallel;
}
286+
287+
/// Emits, at the rewriter's current insertion point, a copy of the
/// iteration-variable declaration for every loop recorded in \p loopNest, in
/// the nest's iteration order.
///
/// \param rewriter rewriter used to clone the operations.
/// \param loopNest loops together with their iteration-variable info.
/// \param mapper   value mapping updated by the clones.
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                             looputils::LoopNestToIndVarMap &loopNest,
                             mlir::IRMapping &mapper) const {
  for (auto &loopAndInfo : loopNest) {
    mlir::Operation *memDef = loopAndInfo.second.iterVarMemDef;
    genInductionVariableAlloc(rewriter, memDef, mapper);
  }
}
294+
295+
/// Clones the operation that declares a loop's iteration variable, together
/// with the operations that directly define its operands, at the rewriter's
/// current insertion point.
///
/// The operand-defining operations are cloned first (each one only once, even
/// if it feeds several operands) so that \p mapper already maps their results
/// when \p indVarMemDef itself is cloned. Operands that have no defining
/// operation (i.e. block arguments) are skipped; the clone then keeps using
/// the original value for them unless \p mapper already remaps it.
///
/// \param rewriter     rewriter used to clone the operations.
/// \param indVarMemDef operation declaring the iteration variable; must not be
///                     null.
/// \param mapper       value mapping consulted and updated by the clones.
/// \return the last operation that was cloned.
mlir::Operation *
genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                          mlir::Operation *indVarMemDef,
                          mlir::IRMapping &mapper) const {
  assert(
      indVarMemDef != nullptr &&
      "Induction variable memdef is expected to have a defining operation.");

  llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
  for (mlir::Value operand : indVarMemDef->getOperands()) {
    // Block arguments have no defining op; there is nothing to clone for
    // them, and dereferencing the null pointer below would crash.
    if (mlir::Operation *operandDef = operand.getDefiningOp())
      indVarDeclareAndAlloc.insert(operandDef);
  }
  indVarDeclareAndAlloc.insert(indVarMemDef);

  mlir::Operation *result = nullptr;
  for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
    result = rewriter.clone(*opToClone, mapper);

  return result;
}
314+
315+
/// Fills \p loopNestClauseOps with the lower bound, upper bound and step of
/// every loop in \p loopNest (in the nest's iteration order) and marks the
/// resulting loop nest as inclusive.
///
/// The bound values are used as-is. This also covers bounds that are block
/// arguments or that are not the first result of their defining operation.
///
/// \param loc      unused here; kept so the signature stays stable for
///                 callers.
/// \param rewriter used to create the `inclusive` unit attribute.
/// \param loopNest loops whose bounds are collected.
/// \param mapper   unused here; kept so the signature stays stable for
///                 callers.
/// \param loopNestClauseOps output operand structure; its lower-bound list
///                 must be empty on entry.
void genLoopNestClauseOps(
    mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
    looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
    mlir::omp::LoopNestOperands &loopNestClauseOps) const {
  assert(loopNestClauseOps.loopLowerBounds.empty() &&
         "Loop nest bounds were already emitted!");

  for (auto &[doLoop, _] : loopNest) {
    loopNestClauseOps.loopLowerBounds.push_back(doLoop.getLowerBound());
    loopNestClauseOps.loopUpperBounds.push_back(doLoop.getUpperBound());
    loopNestClauseOps.loopSteps.push_back(doLoop.getStep());
  }

  loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
335+
336+
/// Builds an `omp::WsloopOp` wrapping an `omp::LoopNestOp` whose body is a
/// copy of \p doLoop's region.
///
/// The region is cloned through \p mapper, so values that were remapped
/// earlier are substituted in the copy. The terminator of the cloned region's
/// last block is replaced by an `omp::YieldOp` carrying the same location.
///
/// \param rewriter    rewriter used to build the operations.
/// \param doLoop      loop whose body is cloned into the loop nest.
/// \param mapper      value mapping applied while cloning the body.
/// \param clauseOps   operands used to create the loop nest operation.
/// \param isComposite value forwarded to `setComposite` on the wsloop.
/// \return the newly created loop nest operation.
mlir::omp::LoopNestOp
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
            mlir::IRMapping &mapper,
            const mlir::omp::LoopNestOperands &clauseOps,
            bool isComposite) const {
  mlir::Location loopLoc = doLoop.getLoc();

  auto wsloop = rewriter.create<mlir::omp::WsloopOp>(loopLoc);
  wsloop.setComposite(isComposite);
  rewriter.createBlock(&wsloop.getRegion());

  auto nest = rewriter.create<mlir::omp::LoopNestOp>(loopLoc, clauseOps);
  mlir::Region &nestRegion = nest.getRegion();

  // Copy the original loop body into the (still empty) loop nest region,
  // remapping values through `mapper`.
  rewriter.cloneRegionBefore(doLoop.getRegion(), nestRegion,
                             nestRegion.begin(), mapper);

  // Swap the cloned terminator for an `omp.yield`.
  mlir::Block &lastBlock = nestRegion.back();
  mlir::Operation *oldTerminator = lastBlock.getTerminator();
  rewriter.setInsertionPointToEnd(&lastBlock);
  rewriter.create<mlir::omp::YieldOp>(oldTerminator->getLoc());
  rewriter.eraseOp(oldTerminator);

  return nest;
}
361+
174362
bool mapToDevice;
363+
llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
175364
};
176365

177366
class DoConcurrentConversionPass
@@ -200,16 +389,18 @@ class DoConcurrentConversionPass
200389
return;
201390
}
202391

392+
llvm::DenseSet<fir::DoLoopOp> concurrentLoopsToSkip;
203393
mlir::RewritePatternSet patterns(context);
204394
patterns.insert<DoConcurrentConversion>(
205-
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device);
395+
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
396+
concurrentLoopsToSkip);
206397
mlir::ConversionTarget target(*context);
207398
target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
208399
// The goal is to handle constructs that eventually get lowered to
209400
// `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
210401
// Currently, this is only enabled for the `do concurrent` construct since
211402
// the pass runs early in the pipeline.
212-
return !op.getUnordered();
403+
return !op.getUnordered() || concurrentLoopsToSkip.contains(op);
213404
});
214405
target.markUnknownOpDynamicallyLegal(
215406
[](mlir::Operation *) { return true; });

flang/test/Transforms/DoConcurrent/basic_host.f90

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
! Mark as xfail for now until we upstream the relevant part. This is just for
2-
! demo purposes at this point. Upstreaming this is the next step.
3-
! XFAIL: *
4-
51
! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
62

73
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
@@ -19,17 +15,17 @@ program do_concurrent_basic
1915

2016
! CHECK-NOT: fir.do_loop
2117

22-
! CHECK: omp.parallel {
23-
24-
! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
25-
! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
26-
2718
! CHECK: %[[C1:.*]] = arith.constant 1 : i32
2819
! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
2920
! CHECK: %[[C10:.*]] = arith.constant 10 : i32
3021
! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
3122
! CHECK: %[[STEP:.*]] = arith.constant 1 : index
3223

24+
! CHECK: omp.parallel {
25+
26+
! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
27+
! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
28+
3329
! CHECK: omp.wsloop {
3430
! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
3531
! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32

0 commit comments

Comments
 (0)