Skip to content

Commit 28b3143

Browse files
committed
[flang][OpenMP] Map simple do concurrent loops to OpenMP host constructs (llvm#127633)
Upstreams one more part of the ROCm `do concurrent` to OpenMP mapping pass. This PR adds support for converting simple loops to the equivalent OpenMP constructs on the host: `omp parallel do`. Towards that end, we have to collect more information about loop nests, for which we add new utils in the `looputils` namespace. PR stack: - llvm#126026 - llvm#127595 - llvm#127633 (this PR) - llvm#127634 - llvm#127635
1 parent df59706 commit 28b3143

File tree

8 files changed

+278
-190
lines changed

8 files changed

+278
-190
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,53 @@ see the "Data environment" section below.
126126
See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
127127
of what is and is not detected as a perfect loop nest.
128128

129+
### Single-range loops
130+
131+
Given the following loop:
132+
```fortran
133+
do concurrent(i=1:n)
134+
a(i) = i * i
135+
end do
136+
```
137+
138+
#### Mapping to `host`
139+
140+
Mapping this loop to the `host` generates MLIR operations of the following
141+
structure:
142+
143+
```
144+
%4 = fir.address_of(@_QFEa) ...
145+
%6:2 = hlfir.declare %4 ...
146+
147+
omp.parallel {
148+
// Allocate private copy for `i`.
149+
// TODO Use delayed privatization.
150+
%19 = fir.alloca i32 {bindc_name = "i"}
151+
%20:2 = hlfir.declare %19 {uniq_name = "_QFEi"} ...
152+
153+
omp.wsloop {
154+
omp.loop_nest (%arg0) : index = (%21) to (%22) inclusive step (%c1_2) {
155+
%23 = fir.convert %arg0 : (index) -> i32
156+
// Use the privatized version of `i`.
157+
fir.store %23 to %20#1 : !fir.ref<i32>
158+
...
159+
160+
// Use "shared" SSA value of `a`.
161+
%42 = hlfir.designate %6#0
162+
hlfir.assign %35 to %42
163+
...
164+
omp.yield
165+
}
166+
omp.terminator
167+
}
168+
omp.terminator
169+
}
170+
```
171+
172+
#### Mapping to `device`
173+
174+
<!-- TODO -->
175+
129176
<!--
130177
More details about current status will be added along with relevant parts of the
131178
implementation in later upstreaming patches.

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

Lines changed: 156 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -161,18 +161,71 @@ void cloneOrMapRegionOutsiders(fir::FirOpBuilder &builder,
161161
namespace {
162162
namespace looputils {
163163
/// Stores info needed about the induction/iteration variable for each `do
164-
/// concurrent` in a loop nest. This includes:
165-
/// * the operation allocating memory for iteration variable,
166-
/// * the operation(s) updating the iteration variable with the current
167-
/// iteration number.
164+
/// concurrent` in a loop nest.
168165
struct InductionVariableInfo {
166+
/// The operation allocating memory for the iteration variable.
169167
mlir::Operation *iterVarMemDef;
168+
169+
/// The operation(s) updating the iteration variable with the current
170+
/// iteration number.
170171
llvm::SetVector<mlir::Operation *> indVarUpdateOps;
171172
};
172173

173174
using LoopNestToIndVarMap =
174175
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
175176

177+
/// For the \p doLoop parameter, find the operation that declares its iteration
178+
/// variable or allocates memory for it.
179+
///
180+
/// For example, give the following loop:
181+
/// ```
182+
/// ...
183+
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
184+
/// ...
185+
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
186+
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
187+
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
188+
/// ...
189+
/// }
190+
/// ```
191+
///
192+
/// This function returns the `hlfir.declare` op for `%i`.
193+
///
194+
/// Note: The current implementation is dependent on how flang emits loop
195+
/// bodies; which is sufficient for the current simple test/use cases. If this
196+
/// proves to be insufficient, this should be made more generic.
197+
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
198+
mlir::Value result = nullptr;
199+
200+
// Checks if a StoreOp is updating the memref of the loop's iteration
201+
// variable.
202+
auto isStoringIV = [&](fir::StoreOp storeOp) {
203+
// Direct store into the IV memref.
204+
if (storeOp.getValue() == doLoop.getInductionVar())
205+
return true;
206+
207+
// Indirect store into the IV memref.
208+
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
209+
storeOp.getValue().getDefiningOp())) {
210+
if (convertOp.getOperand() == doLoop.getInductionVar())
211+
return true;
212+
}
213+
214+
return false;
215+
};
216+
217+
for (mlir::Operation &op : doLoop) {
218+
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
219+
if (isStoringIV(storeOp)) {
220+
result = storeOp.getMemref();
221+
break;
222+
}
223+
}
224+
225+
assert(result != nullptr && result.getDefiningOp() != nullptr);
226+
return result.getDefiningOp();
227+
}
228+
176229
/// Given an operation `op`, this returns true if `op`'s operand is ultimately
177230
/// the loop's induction variable. Detecting this helps finding the live-in
178231
/// value corresponding to the induction variable in case the induction variable
@@ -412,7 +465,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
412465
loopNest.insert(
413466
{currentLoop,
414467
InductionVariableInfo{
415-
findLoopIndVarMemDecl(currentLoop),
468+
findLoopIterationVarMemDecl(currentLoop),
416469
std::move(looputils::extractIndVarUpdateOps(currentLoop))}});
417470
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
418471

@@ -715,6 +768,104 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
715768
using LiveInShapeInfoMap =
716769
llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
717770

771+
/// Creates an `omp.parallel` op at \p loc, gives its region a single block
/// ending in an `omp.terminator`, and emits the induction-variable
/// allocations for every loop in \p loopNest right before that terminator.
///
/// On return, the rewriter's insertion point is left before the terminator of
/// the new parallel region. Returns the created `omp.parallel` op.
mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
                                    mlir::ConversionPatternRewriter &rewriter,
                                    looputils::LoopNestToIndVarMap &loopNest,
                                    mlir::IRMapping &mapper) const {
  auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
  rewriter.createBlock(&parallelOp.getRegion());

  // Everything generated from here on goes before the region's terminator.
  auto regionTerminator = rewriter.create<mlir::omp::TerminatorOp>(loc);
  rewriter.setInsertionPoint(regionTerminator);

  genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
  return parallelOp;
}
782+
783+
/// Emits, at the rewriter's current insertion point, the induction-variable
/// allocation for each loop recorded in \p loopNest (in map order) by
/// delegating to `genInductionVariableAlloc`.
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                             looputils::LoopNestToIndVarMap &loopNest,
                             mlir::IRMapping &mapper) const {
  for (auto &loopAndIndVarInfo : loopNest) {
    mlir::Operation *memDef = loopAndIndVarInfo.second.iterVarMemDef;
    genInductionVariableAlloc(rewriter, memDef, mapper);
  }
}
790+
791+
/// Clones \p indVarMemDef, preceded by the ops defining its operands, at the
/// rewriter's current insertion point, recording the clones in \p mapper.
///
/// Operands that have no defining op (block arguments) are skipped since
/// there is nothing to clone for them.
///
/// \returns the last op cloned; since \p indVarMemDef is inserted into the
/// set after its operands' defining ops, this is normally its clone.
mlir::Operation *
genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                          mlir::Operation *indVarMemDef,
                          mlir::IRMapping &mapper) const {
  assert(
      indVarMemDef != nullptr &&
      "Induction variable memdef is expected to have a defining operation.");

  llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
  for (mlir::Value operand : indVarMemDef->getOperands())
    // `getDefiningOp()` is null for block arguments; never queue a null op
    // for cloning.
    if (mlir::Operation *operandDef = operand.getDefiningOp())
      indVarDeclareAndAlloc.insert(operandDef);
  indVarDeclareAndAlloc.insert(indVarMemDef);

  mlir::Operation *result = nullptr;
  for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
    result = rewriter.clone(*opToClone, mapper);

  return result;
}
810+
811+
/// Populates \p loopNestClauseOps with the lower bound, upper bound and step
/// of every loop in \p loopNest and marks the resulting loop nest as
/// inclusive.
///
/// If \p targetClauseOps is provided, each bound/step value is additionally
/// appended to its `hostEvalVars` list.
///
/// Asserts that no lower bounds were already collected in
/// \p loopNestClauseOps. \p loc and \p mapper are not used by this function.
void genLoopNestClauseOps(
    mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
    looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
    mlir::omp::LoopNestOperands &loopNestClauseOps,
    mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
  assert(loopNestClauseOps.loopLowerBounds.empty() &&
         "Loop nest bounds were already emitted!");

  auto hostEvalCapture = [&](mlir::Value var,
                             llvm::SmallVectorImpl<mlir::Value> &bounds) {
    // Use the bound value itself rather than result #0 of its defining op:
    // the value may be a block argument (no defining op) or a result other
    // than the first one.
    bounds.push_back(var);

    if (targetClauseOps)
      targetClauseOps->hostEvalVars.push_back(var);
  };

  for (auto &[doLoop, _] : loopNest) {
    hostEvalCapture(doLoop.getLowerBound(),
                    loopNestClauseOps.loopLowerBounds);
    hostEvalCapture(doLoop.getUpperBound(),
                    loopNestClauseOps.loopUpperBounds);
    hostEvalCapture(doLoop.getStep(), loopNestClauseOps.loopSteps);
  }

  loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
842+
843+
/// Creates an `omp.wsloop` wrapping an `omp.loop_nest` built from
/// \p clauseOps, both located at \p doLoop's location, and clones the body of
/// \p doLoop into the loop nest using \p mapper. The cloned body's terminator
/// is replaced by an `omp.yield`.
///
/// \p isComposite is forwarded to the `omp.wsloop` op's composite flag.
/// Returns the created `omp.loop_nest` op.
mlir::omp::LoopNestOp
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
            mlir::IRMapping &mapper,
            const mlir::omp::LoopNestOperands &clauseOps,
            bool isComposite) const {
  mlir::Location loopLoc = doLoop.getLoc();

  auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loopLoc);
  wsloopOp.setComposite(isComposite);
  rewriter.createBlock(&wsloopOp.getRegion());

  auto loopNestOp = rewriter.create<mlir::omp::LoopNestOp>(loopLoc, clauseOps);
  mlir::Region &nestRegion = loopNestOp.getRegion();

  // Populate the loop nest with a copy of the original loop body, remapping
  // values through `mapper`.
  rewriter.cloneRegionBefore(doLoop.getRegion(), nestRegion,
                             nestRegion.begin(), mapper);

  // Swap the cloned terminator for an `omp.yield` at the same location.
  mlir::Block &lastBlock = nestRegion.back();
  mlir::Operation *clonedTerminator = lastBlock.getTerminator();
  rewriter.setInsertionPointToEnd(&lastBlock);
  rewriter.create<mlir::omp::YieldOp>(clonedTerminator->getLoc());
  rewriter.eraseOp(clonedTerminator);

  return loopNestOp;
}
868+
718869
void
719870
genBoundsOps(mlir::ConversionPatternRewriter &rewriter, mlir::Location loc,
720871
mlir::Value shape, llvm::SmallVectorImpl<mlir::Value> &boundsOps,
@@ -983,51 +1134,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
9831134
return teamsOp;
9841135
}
9851136

986-
void genLoopNestClauseOps(
987-
mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
988-
looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
989-
mlir::omp::LoopNestOperands &loopNestClauseOps,
990-
mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
991-
assert(loopNestClauseOps.loopLowerBounds.empty() &&
992-
"Loop nest bounds were already emitted!");
993-
994-
// Clones the chain of ops defining a certain loop bound or its step into
995-
// the parallel region. For example, if the value of a bound is defined by a
996-
// `fir.convert`op, this lambda clones the `fir.convert` as well as the
997-
// value it converts from. We do this since `omp.target` regions are
998-
// isolated from above.
999-
auto cloneBoundOrStepOpChain =
1000-
[&](mlir::Operation *operation) -> mlir::Operation * {
1001-
llvm::SetVector<mlir::Operation *> opChain;
1002-
looputils::collectIndirectConstOpChain(operation, opChain);
1003-
1004-
mlir::Operation *result;
1005-
for (mlir::Operation *link : opChain)
1006-
result = rewriter.clone(*link, mapper);
1007-
1008-
return result;
1009-
};
1010-
1011-
auto hostEvalCapture = [&](mlir::Value var,
1012-
llvm::SmallVectorImpl<mlir::Value> &bounds) {
1013-
var = cloneBoundOrStepOpChain(var.getDefiningOp())->getResult(0);
1014-
bounds.push_back(var);
1015-
1016-
if (targetClauseOps)
1017-
targetClauseOps->hostEvalVars.push_back(var);
1018-
};
1019-
1020-
for (auto &[doLoop, _] : loopNest) {
1021-
hostEvalCapture(doLoop.getLowerBound(),
1022-
loopNestClauseOps.loopLowerBounds);
1023-
hostEvalCapture(doLoop.getUpperBound(),
1024-
loopNestClauseOps.loopUpperBounds);
1025-
hostEvalCapture(doLoop.getStep(), loopNestClauseOps.loopSteps);
1026-
}
1027-
1028-
loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
1029-
}
1030-
10311137
mlir::omp::DistributeOp
10321138
genDistributeOp(mlir::Location loc,
10331139
mlir::ConversionPatternRewriter &rewriter) const {
@@ -1038,72 +1144,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
10381144
return distOp;
10391145
}
10401146

1041-
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
1042-
looputils::LoopNestToIndVarMap &loopNest,
1043-
mlir::IRMapping &mapper) const {
1044-
1045-
for (auto &[_, indVarInfo] : loopNest)
1046-
genInductionVariableAlloc(rewriter, indVarInfo.iterVarMemDef, mapper);
1047-
}
1048-
1049-
mlir::Operation *
1050-
genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
1051-
mlir::Operation *indVarMemDef,
1052-
mlir::IRMapping &mapper) const {
1053-
assert(
1054-
indVarMemDef != nullptr &&
1055-
"Induction variable memdef is expected to have a defining operation.");
1056-
1057-
llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
1058-
for (auto operand : indVarMemDef->getOperands())
1059-
indVarDeclareAndAlloc.insert(operand.getDefiningOp());
1060-
indVarDeclareAndAlloc.insert(indVarMemDef);
1061-
1062-
mlir::Operation *result;
1063-
for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
1064-
result = rewriter.clone(*opToClone, mapper);
1065-
1066-
return result;
1067-
}
1068-
1069-
mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
1070-
mlir::ConversionPatternRewriter &rewriter,
1071-
looputils::LoopNestToIndVarMap &loopNest,
1072-
mlir::IRMapping &mapper) const {
1073-
auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
1074-
rewriter.createBlock(&parallelOp.getRegion());
1075-
rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
1076-
1077-
genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
1078-
return parallelOp;
1079-
}
1080-
1081-
mlir::omp::LoopNestOp
1082-
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop,
1083-
mlir::IRMapping &mapper,
1084-
const mlir::omp::LoopNestOperands &clauseOps,
1085-
bool isComposite) const {
1086-
1087-
auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(doLoop.getLoc());
1088-
wsloopOp.setComposite(isComposite);
1089-
rewriter.createBlock(&wsloopOp.getRegion());
1090-
1091-
auto loopNestOp =
1092-
rewriter.create<mlir::omp::LoopNestOp>(doLoop.getLoc(), clauseOps);
1093-
1094-
// Clone the loop's body inside the loop nest construct using the
1095-
// mapped values.
1096-
rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(),
1097-
loopNestOp.getRegion().begin(), mapper);
1098-
1099-
mlir::Operation *terminator = loopNestOp.getRegion().back().getTerminator();
1100-
rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
1101-
rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
1102-
rewriter.eraseOp(terminator);
1103-
1104-
return loopNestOp;
1105-
}
1106-
11071147
bool mapToDevice;
11081148
llvm::DenseSet<fir::DoLoopOp> &concurrentLoopsToSkip;
11091149
};
@@ -1152,8 +1192,6 @@ class DoConcurrentConversionPass
11521192

11531193
if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
11541194
std::move(patterns)))) {
1155-
mlir::emitError(mlir::UnknownLoc::get(context),
1156-
"error in converting do-concurrent op");
11571195
signalPassFailure();
11581196
}
11591197
}

flang/test/Transforms/DoConcurrent/basic_device.f90

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,6 @@ program do_concurrent_basic
2121

2222
! CHECK-NOT: fir.do_loop
2323

24-
! CHECK: %[[DUPLICATED_C1:.*]] = arith.constant 1 : i32
25-
! CHECK: %[[DUPLICATED_LB:.*]] = fir.convert %[[DUPLICATED_C1]] : (i32) -> index
26-
! CHECK: %[[DUPLICATED_C10:.*]] = arith.constant 10 : i32
27-
! CHECK: %[[DUPLICATED_UB:.*]] = fir.convert %[[DUPLICATED_C10]] : (i32) -> index
28-
! CHECK: %[[DUPLICATED_STEP:.*]] = arith.constant 1 : index
29-
3024
! CHECK: %[[C1:.*]] = arith.constant 1 : i32
3125
! CHECK: %[[HOST_LB:.*]] = fir.convert %[[C1]] : (i32) -> index
3226
! CHECK: %[[C10:.*]] = arith.constant 10 : i32

0 commit comments

Comments
 (0)