Skip to content

Commit 0d40574

Browse files
authored
[flang] Inline hlfir.copy_in for trivial types (#138718)
hlfir.copy_in, which is implemented through flang-rt, copies non-contiguous array slices for functions that take in arrays required to be contiguous. For large arrays of trivial types, this can incur overhead compared to a plain, inlined copy loop. To address that, add a new InlineHLFIRCopyIn optimisation pass to inline hlfir.copy_in operations for trivial types. For the time being, the pattern is only applied in cases where the copy-in does not require a corresponding copy-out, such as when the function being called declares the array parameter as intent(in). Applying this optimisation reduces the runtime of thornado-mini's DeleptonizationProblem by about 10%. --------- Signed-off-by: Kajetan Puchalski <[email protected]>
1 parent c4012bb commit 0d40574

File tree

7 files changed

+416
-4
lines changed

7 files changed

+416
-4
lines changed

flang/include/flang/Optimizer/Builder/HLFIRTools.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -374,12 +374,14 @@ struct LoopNest {
374374
/// loop constructs currently.
375375
LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
376376
mlir::ValueRange extents, bool isUnordered = false,
377-
bool emitWorkshareLoop = false);
377+
bool emitWorkshareLoop = false,
378+
bool couldVectorize = true);
378379
inline LoopNest genLoopNest(mlir::Location loc, fir::FirOpBuilder &builder,
379380
mlir::Value shape, bool isUnordered = false,
380-
bool emitWorkshareLoop = false) {
381+
bool emitWorkshareLoop = false,
382+
bool couldVectorize = true) {
381383
return genLoopNest(loc, builder, getIndexExtents(loc, builder, shape),
382-
isUnordered, emitWorkshareLoop);
384+
isUnordered, emitWorkshareLoop, couldVectorize);
383385
}
384386

385387
/// The type of a callback that generates the body of a reduction

flang/include/flang/Optimizer/HLFIR/Passes.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ def InlineHLFIRAssign : Pass<"inline-hlfir-assign"> {
6969
let summary = "Inline hlfir.assign operations";
7070
}
7171

72+
// Pass definition for inlining hlfir.copy_in operations. The string given to
// Pass<> ("inline-hlfir-copy-in") is the pass argument; the summary is its
// one-line description.
def InlineHLFIRCopyIn : Pass<"inline-hlfir-copy-in"> {
73+
let summary = "Inline hlfir.copy_in operations";
74+
}
75+
7276
def PropagateFortranVariableAttributes : Pass<"propagate-fortran-attrs"> {
7377
let summary = "Propagate FortranVariableFlagsAttr attributes through HLFIR";
7478
}

flang/lib/Optimizer/Builder/HLFIRTools.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "mlir/IR/IRMapping.h"
2222
#include "mlir/Support/LLVM.h"
2323
#include "llvm/ADT/TypeSwitch.h"
24+
#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
2425
#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
2526
#include <optional>
2627

@@ -932,7 +933,8 @@ mlir::Value hlfir::inlineElementalOp(
932933
hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
933934
fir::FirOpBuilder &builder,
934935
mlir::ValueRange extents, bool isUnordered,
935-
bool emitWorkshareLoop) {
936+
bool emitWorkshareLoop,
937+
bool couldVectorize) {
936938
emitWorkshareLoop = emitWorkshareLoop && isUnordered;
937939
hlfir::LoopNest loopNest;
938940
assert(!extents.empty() && "must have at least one extent");
@@ -967,6 +969,15 @@ hlfir::LoopNest hlfir::genLoopNest(mlir::Location loc,
967969
auto ub = builder.createConvert(loc, indexType, extent);
968970
auto doLoop =
969971
builder.create<fir::DoLoopOp>(loc, one, ub, one, isUnordered);
972+
if (!couldVectorize) {
973+
mlir::LLVM::LoopVectorizeAttr va{mlir::LLVM::LoopVectorizeAttr::get(
974+
builder.getContext(),
975+
/*disable=*/builder.getBoolAttr(true), {}, {}, {}, {}, {}, {})};
976+
mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get(
977+
builder.getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ {},
978+
/*unroll_and_jam*/ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
979+
doLoop.setLoopAnnotationAttr(la);
980+
}
970981
loopNest.body = doLoop.getBody();
971982
builder.setInsertionPointToStart(loopNest.body);
972983
// Reverse the indices so they are in column-major order.

flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_flang_library(HLFIRTransforms
55
ConvertToFIR.cpp
66
InlineElementals.cpp
77
InlineHLFIRAssign.cpp
8+
InlineHLFIRCopyIn.cpp
89
LowerHLFIRIntrinsics.cpp
910
LowerHLFIROrderedAssignments.cpp
1011
ScheduleOrderedAssignments.cpp
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
//===- InlineHLFIRCopyIn.cpp - Inline hlfir.copy_in ops -------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Transform hlfir.copy_in array operations into loop nests performing element
9+
// per element assignments. For simplicity, the inlining is done for trivial
10+
// data types when the copy_in does not require a corresponding copy_out and
11+
// when the input array is not behind a pointer. This may change in the future.
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "flang/Optimizer/Builder/FIRBuilder.h"
15+
#include "flang/Optimizer/Builder/HLFIRTools.h"
16+
#include "flang/Optimizer/Dialect/FIRType.h"
17+
#include "flang/Optimizer/HLFIR/HLFIROps.h"
18+
#include "flang/Optimizer/OpenMP/Passes.h"
19+
#include "mlir/IR/PatternMatch.h"
20+
#include "mlir/Support/LLVM.h"
21+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
22+
23+
namespace hlfir {
24+
#define GEN_PASS_DEF_INLINEHLFIRCOPYIN
25+
#include "flang/Optimizer/HLFIR/Passes.h.inc"
26+
} // namespace hlfir
27+
28+
#define DEBUG_TYPE "inline-hlfir-copy-in"
29+
30+
// Boolean command-line option `no-inline-hlfir-copy-in`, initialised to false.
// InlineHLFIRCopyInPass::runOnOperation only registers the inlining pattern
// when this option is not set.
static llvm::cl::opt<bool> noInlineHLFIRCopyIn(
31+
"no-inline-hlfir-copy-in",
32+
llvm::cl::desc("Do not inline hlfir.copy_in operations"),
33+
llvm::cl::init(false));
34+
35+
namespace {
36+
/// Rewrite pattern rooted on hlfir::CopyInOp. It adds no state of its own:
/// the constructors are inherited from mlir::OpRewritePattern and the only
/// member is the matchAndRewrite override, which is declared here and defined
/// out of line.
class InlineCopyInConversion : public mlir::OpRewritePattern<hlfir::CopyInOp> {
37+
public:
38+
// Reuse the base class constructors.
using mlir::OpRewritePattern<hlfir::CopyInOp>::OpRewritePattern;
39+
40+
llvm::LogicalResult
41+
matchAndRewrite(hlfir::CopyInOp copyIn,
42+
mlir::PatternRewriter &rewriter) const override;
43+
};
44+
45+
/// Replace an hlfir.copy_in with an inline if/else built through genIfOp.
///
/// The match is rejected (notifyMatchFailure) when any of the following holds:
///  - the Fortran element type of the input is not trivial;
///  - the WasCopied result does not have exactly one use;
///  - that single user is not an hlfir::CopyOutOp;
///  - the type of the CopiedIn result is assumed-rank;
///  - the copy_out has a `var` operand, i.e. it would have to copy back.
///
/// Otherwise the input is dereferenced (pointers/allocatables) and tested with
/// fir::IsContiguousBoxOp:
///  - then-branch (contiguous): yields the input itself, reboxed to the result
///    box type if its type is a pointer type, together with `false`;
///  - else-branch: allocates a heap temporary, copies the input into it one
///    element at a time with a loop nest, boxes the temporary if it is not
///    already a box, and yields it together with `true`.
///
/// Finally the copy_out's operands 0 and 1 are re-pointed at a new alloca
/// holding the resulting box and at the yielded flag, and the copy_in's two
/// results are replaced.
///
/// \param copyIn   the operation to rewrite.
/// \param rewriter the pattern rewriter driving the transformation.
/// \return success if the rewrite was applied, a match failure otherwise.
llvm::LogicalResult
46+
InlineCopyInConversion::matchAndRewrite(hlfir::CopyInOp copyIn,
47+
mlir::PatternRewriter &rewriter) const {
48+
fir::FirOpBuilder builder(rewriter, copyIn.getOperation());
49+
mlir::Location loc = copyIn.getLoc();
50+
hlfir::Entity inputVariable{copyIn.getVar()};
51+
mlir::Type resultAddrType = copyIn.getCopiedIn().getType();
52+
if (!fir::isa_trivial(inputVariable.getFortranElementType()))
53+
return rewriter.notifyMatchFailure(copyIn,
54+
"CopyInOp's data type is not trivial");
55+
56+
// There should be exactly one user of WasCopied - the corresponding
57+
// CopyOutOp.
58+
if (!copyIn.getWasCopied().hasOneUse())
59+
return rewriter.notifyMatchFailure(
60+
copyIn, "CopyInOp's WasCopied has no single user");
61+
// The copy out should always be present, either to actually copy or just
62+
// deallocate memory.
63+
auto copyOut = mlir::dyn_cast<hlfir::CopyOutOp>(
64+
copyIn.getWasCopied().user_begin().getCurrent().getUser());
65+
66+
// dyn_cast yields null when the single user is some other operation.
if (!copyOut)
67+
return rewriter.notifyMatchFailure(copyIn,
68+
"CopyInOp has no direct CopyOut");
69+
70+
if (mlir::cast<fir::BaseBoxType>(resultAddrType).isAssumedRank())
71+
return rewriter.notifyMatchFailure(copyIn,
72+
"The result array is assumed-rank");
73+
74+
// Only inline the copy_in when copy_out does not need to be done, i.e. in
75+
// case of intent(in).
76+
if (copyOut.getVar())
77+
return rewriter.notifyMatchFailure(copyIn, "CopyIn needs a copy-out");
78+
79+
// All match checks passed; IR is created from this point on.
inputVariable =
80+
hlfir::derefPointersAndAllocatables(loc, builder, inputVariable);
81+
mlir::Type sequenceType =
82+
hlfir::getFortranElementOrSequenceType(inputVariable.getType());
83+
fir::BoxType resultBoxType = fir::BoxType::get(sequenceType);
84+
mlir::Value isContiguous =
85+
builder.create<fir::IsContiguousBoxOp>(loc, inputVariable);
86+
// The if/else yields two values: the box to use and an i1 flag that is true
// only on the branch that allocated a temporary.
mlir::Operation::result_range results =
87+
builder
88+
.genIfOp(loc, {resultBoxType, builder.getI1Type()}, isContiguous,
89+
/*withElseRegion=*/true)
90+
.genThen([&]() {
91+
// Contiguous input: no copy, forward the input itself.
mlir::Value result = inputVariable;
92+
if (fir::isPointerType(inputVariable.getType())) {
93+
// Rebox so that the yielded value has the common result box type.
result = builder.create<fir::ReboxOp>(
94+
loc, resultBoxType, inputVariable, mlir::Value{},
95+
mlir::Value{});
96+
}
97+
builder.create<fir::ResultOp>(
98+
loc, mlir::ValueRange{result, builder.createBool(loc, false)});
99+
})
100+
.genElse([&] {
101+
// Non-contiguous input: allocate a heap temporary with the input's
// extents and declare it so it can be addressed as an hlfir entity.
mlir::Value shape = hlfir::genShape(loc, builder, inputVariable);
102+
llvm::SmallVector<mlir::Value> extents =
103+
hlfir::getIndexExtents(loc, builder, shape);
104+
llvm::StringRef tmpName{".tmp.copy_in"};
105+
// Left empty: no length parameters are passed for the temporary.
llvm::SmallVector<mlir::Value> lenParams;
106+
mlir::Value alloc = builder.createHeapTemporary(
107+
loc, sequenceType, tmpName, extents, lenParams);
108+
109+
auto declareOp = builder.create<hlfir::DeclareOp>(
110+
loc, alloc, tmpName, shape, lenParams,
111+
/*dummy_scope=*/nullptr);
112+
hlfir::Entity temp{declareOp.getBase()};
113+
// Unordered loop nest over all extents, requested with
// couldVectorize=false.
hlfir::LoopNest loopNest =
114+
hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
115+
flangomp::shouldUseWorkshareLowering(copyIn),
116+
/*couldVectorize=*/false);
117+
// NOTE(review): genLoopNest appears to already leave the insertion
// point at the start of the loop body, so this may be redundant -
// confirm before removing.
builder.setInsertionPointToStart(loopNest.body);
118+
// Loop body: load input(i...) and assign it to temp(i...).
hlfir::Entity elem = hlfir::getElementAt(
119+
loc, builder, inputVariable, loopNest.oneBasedIndices);
120+
elem = hlfir::loadTrivialScalar(loc, builder, elem);
121+
hlfir::Entity tempElem = hlfir::getElementAt(
122+
loc, builder, temp, loopNest.oneBasedIndices);
123+
builder.create<hlfir::AssignOp>(loc, elem, tempElem);
124+
// Continue emitting after the whole loop nest.
builder.setInsertionPointAfter(loopNest.outerOp);
125+
126+
mlir::Value result;
127+
// Make sure the result is always a boxed array by boxing it
128+
// ourselves if need be.
129+
if (mlir::isa<fir::BaseBoxType>(temp.getType())) {
130+
result = temp;
131+
} else {
132+
fir::ReferenceType refTy =
133+
fir::ReferenceType::get(temp.getElementOrSequenceType());
134+
mlir::Value refVal = builder.createConvert(loc, refTy, temp);
135+
result = builder.create<fir::EmboxOp>(loc, resultBoxType, refVal,
136+
shape);
137+
}
138+
139+
builder.create<fir::ResultOp>(
140+
loc, mlir::ValueRange{result, builder.createBool(loc, true)});
141+
})
142+
.getResults();
143+
144+
mlir::OpResult resultBox = results[0];
145+
mlir::OpResult needsCleanup = results[1];
146+
147+
// Prepare the corresponding copyOut to free the temporary if it is required
148+
// The resulting box is stored into a fresh alloca, and that memory reference
// becomes operand 0 of the copy_out.
auto alloca = builder.create<fir::AllocaOp>(loc, resultBox.getType());
149+
auto store = builder.create<fir::StoreOp>(loc, resultBox, alloca);
150+
// copyOut is modified in place, so the change is bracketed with
// start/finalizeOpModification on the rewriter.
rewriter.startOpModification(copyOut);
151+
copyOut->setOperand(0, store.getMemref());
152+
copyOut->setOperand(1, needsCleanup);
153+
rewriter.finalizeOpModification(copyOut);
154+
155+
// The second replacement value (WasCopied) is the negated contiguity test.
// NOTE(review): WasCopied's single use is presumably operand 1 of copyOut,
// which was re-pointed to needsCleanup just above - if so this negation ends
// up without users; confirm.
rewriter.replaceOp(copyIn, {resultBox, builder.genNot(loc, isContiguous)});
156+
return mlir::success();
157+
}
158+
159+
/// Pass that runs the greedy pattern rewrite driver over the operation it is
/// scheduled on, with InlineCopyInConversion as its only pattern. When the
/// `no-inline-hlfir-copy-in` option is set the pattern set is left empty, but
/// the driver is still invoked.
class InlineHLFIRCopyInPass
160+
: public hlfir::impl::InlineHLFIRCopyInBase<InlineHLFIRCopyInPass> {
161+
public:
162+
/// Build the pattern set and apply it greedily; emit an error and signal
/// pass failure if the driver reports failure.
void runOnOperation() override {
163+
mlir::MLIRContext *context = &getContext();
164+
165+
mlir::GreedyRewriteConfig config;
166+
// Prevent the pattern driver from merging blocks.
167+
config.setRegionSimplificationLevel(
168+
mlir::GreedySimplifyRegionLevel::Disabled);
169+
170+
mlir::RewritePatternSet patterns(context);
171+
// Only register the inlining pattern when it has not been disabled on the
// command line.
if (!noInlineHLFIRCopyIn) {
172+
patterns.insert<InlineCopyInConversion>(context);
173+
}
174+
175+
if (mlir::failed(mlir::applyPatternsGreedily(
176+
getOperation(), std::move(patterns), config))) {
177+
// Attach the diagnostic to the location of the operation being processed.
mlir::emitError(getOperation()->getLoc(),
178+
"failure in hlfir.copy_in inlining");
179+
signalPassFailure();
180+
}
181+
}
182+
};
183+
} // namespace

flang/lib/Optimizer/Passes/Pipelines.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,11 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
260260
pm, hlfir::createOptimizedBufferization);
261261
addNestedPassToAllTopLevelOperations<PassConstructor>(
262262
pm, hlfir::createInlineHLFIRAssign);
263+
264+
if (optLevel == llvm::OptimizationLevel::O3) {
265+
addNestedPassToAllTopLevelOperations<PassConstructor>(
266+
pm, hlfir::createInlineHLFIRCopyIn);
267+
}
263268
}
264269
pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
265270
pm.addPass(hlfir::createLowerHLFIRIntrinsics());

0 commit comments

Comments
 (0)