-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[Flang] Hoisting constant-sized allocas at flang codegen. #95310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-openmp @llvm/pr-subscribers-flang-fir-hlfir Author: Vijay Kandiah (VijayKandiah) ChangesThis change modifies the When compiling the example subroutine below with
Without the proposed change, downstream LLVM compilation cannot hoist this constant-sized alloca out of the stacksave/stackrestore region which may lead to missed downstream optimizations:
With this change, the
Full diff: https://github.com/llvm/llvm-project/pull/95310.diff 3 Files Affected:
diff --git a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
index 211acdc8a38e6..6ace73e2d16af 100644
--- a/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
+++ b/flang/include/flang/Optimizer/CodeGen/FIROpPatterns.h
@@ -51,7 +51,9 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
/// appropriate reified structures.
mlir::Value integerCast(mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter,
- mlir::Type ty, mlir::Value val) const;
+ mlir::Type ty, mlir::Value val,
+ bool fold = false) const;
+
struct TypePair {
mlir::Type fir;
mlir::Type llvm;
@@ -144,9 +146,10 @@ class ConvertFIRToLLVMPattern : public mlir::ConvertToLLVMPattern {
// Find the Block in which the alloca should be inserted.
// The order to recursively find the proper block:
// 1. An OpenMP Op that will be outlined.
- // 2. A LLVMFuncOp
- // 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
- mlir::Block *getBlockForAllocaInsert(mlir::Operation *op) const;
+ // 2. An OpenMP or OpenACC Op with one or more regions holding executable code.
+ // 3. A LLVMFuncOp
+ // 4. The first ancestor that is one of the above.
+ mlir::Block *getBlockForAllocaInsert(mlir::Operation *op, mlir::Region *parentRegion) const;
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
// allocation address space provided for the architecture in the DataLayout
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 9f21c6b0cf097..d078a000ccd65 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -218,7 +218,7 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
chrTy.getContext(), chrTy.getFKind());
llvmObjectType = convertType(rawCharTy);
assert(end == 1);
- size = integerCast(loc, rewriter, ity, lenParams[0]);
+ size = integerCast(loc, rewriter, ity, lenParams[0], /*fold=*/true);
} else if (auto recTy = mlir::dyn_cast<fir::RecordType>(scalarType)) {
mlir::LLVM::LLVMFuncOp memSizeFn =
getDependentTypeMemSizeFn(recTy, alloc, rewriter);
@@ -236,17 +236,27 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
}
}
if (auto scaleSize = genAllocationScaleSize(alloc, ity, rewriter))
- size = rewriter.create<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
+ size = rewriter.createOrFold<mlir::LLVM::MulOp>(loc, ity, size, scaleSize);
if (alloc.hasShapeOperands()) {
unsigned end = operands.size();
for (; i < end; ++i)
- size = rewriter.create<mlir::LLVM::MulOp>(
- loc, ity, size, integerCast(loc, rewriter, ity, operands[i]));
+ size = rewriter.createOrFold<mlir::LLVM::MulOp>(
+ loc, ity, size,
+ integerCast(loc, rewriter, ity, operands[i], /*fold=*/true));
}
unsigned allocaAs = getAllocaAddressSpace(rewriter);
unsigned programAs = getProgramAddressSpace(rewriter);
+ if (mlir::isa<mlir::LLVM::ConstantOp>(size.getDefiningOp())) {
+ // Set the Block in which the llvm alloca should be inserted.
+ mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
+ mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+ mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp, parentRegion);
+ size.getDefiningOp()->moveAfter(insertBlock, insertBlock->begin());
+ rewriter.setInsertionPointAfter(size.getDefiningOp());
+ }
+
// NOTE: we used to pass alloc->getAttrs() in the builder for non opaque
// pointers! Only propagate pinned and bindc_name to help debugging, but
// this should have no functional purpose (and passing the operand segment
diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
index 72e072db37432..6d86879cd3219 100644
--- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
+++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -65,7 +65,7 @@ mlir::LLVM::ConstantOp ConvertFIRToLLVMPattern::genConstantOffset(
mlir::Value
ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter,
- mlir::Type ty, mlir::Value val) const {
+ mlir::Type ty, mlir::Value val, bool fold) const {
auto valTy = val.getType();
// If the value was not yet lowered, lower its type so that it can
// be used in getPrimitiveTypeSizeInBits.
@@ -73,10 +73,17 @@ ConvertFIRToLLVMPattern::integerCast(mlir::Location loc,
valTy = convertType(valTy);
auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty);
auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy);
- if (toSize < fromSize)
- return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
- if (toSize > fromSize)
- return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+ if (fold) {
+ if (toSize < fromSize)
+ return rewriter.createOrFold<mlir::LLVM::TruncOp>(loc, ty, val);
+ if (toSize > fromSize)
+ return rewriter.createOrFold<mlir::LLVM::SExtOp>(loc, ty, val);
+ } else {
+ if (toSize < fromSize)
+ return rewriter.create<mlir::LLVM::TruncOp>(loc, ty, val);
+ if (toSize > fromSize)
+ return rewriter.create<mlir::LLVM::SExtOp>(loc, ty, val);
+ }
return val;
}
@@ -274,16 +281,19 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
// Find the Block in which the alloca should be inserted.
// The order to recursively find the proper block:
// 1. An OpenMP Op that will be outlined.
-// 2. A LLVMFuncOp
-// 3. The first ancestor that is an OpenMP Op or a LLVMFuncOp
-mlir::Block *
-ConvertFIRToLLVMPattern::getBlockForAllocaInsert(mlir::Operation *op) const {
+// 2. An OpenMP or OpenACC Op with one or more regions holding executable code.
+// 3. A LLVMFuncOp
+// 4. The first ancestor that is one of the above.
+mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
+ mlir::Operation *op, mlir::Region *parentRegion) const {
if (auto iface = mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
return iface.getAllocaBlock();
+ if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
+ return recipeIface.getAllocaBlock(*parentRegion);
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
return &llvmFuncOp.front();
- return getBlockForAllocaInsert(op->getParentOp());
+ return getBlockForAllocaInsert(op->getParentOp(), parentRegion);
}
// Generate an alloca of size 1 for an object of type \p llvmObjectTy in the
@@ -297,16 +307,9 @@ mlir::Value ConvertFIRToLLVMPattern::genAllocaAndAddrCastWithType(
mlir::ConversionPatternRewriter &rewriter) const {
auto thisPt = rewriter.saveInsertionPoint();
mlir::Operation *parentOp = rewriter.getInsertionBlock()->getParentOp();
- if (mlir::isa<mlir::omp::DeclareReductionOp>(parentOp) ||
- mlir::isa<mlir::omp::PrivateClauseOp>(parentOp)) {
- // DeclareReductionOp & PrivateClauseOp have multiple child regions. We want
- // to get the first block of whichever of those regions we are currently in
- mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
- rewriter.setInsertionPointToStart(&parentRegion->front());
- } else {
- mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp);
- rewriter.setInsertionPointToStart(insertBlock);
- }
+ mlir::Region *parentRegion = rewriter.getInsertionBlock()->getParent();
+ mlir::Block *insertBlock = getBlockForAllocaInsert(parentOp, parentRegion);
+ rewriter.setInsertionPointToStart(insertBlock);
auto size = genI32Constant(loc, rewriter, 1);
unsigned allocaAs = getAllocaAddressSpace(rewriter);
unsigned programAs = getProgramAddressSpace(rewriter);
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Are you also hoisting descriptors? Something like:
|
Yes, I am hoisting descriptors too. Their |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
You have couple of lit tests failure on the pre-commit CI. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM once CI passes
// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 | ||
// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x !llvm.array<1024 x i32> {bindc_name = "b"} : (i64) -> !llvm.ptr | ||
// CHECK: %[[VAL_0:.*]] = llvm.mlir.constant(1 : i64) : i64 | ||
// CHECK: %[[VAL_1:.*]] = llvm.alloca %[[VAL_0]] x !llvm.array<1024 x i32> {bindc_name = "a"} : (i64) -> !llvm.ptr | ||
// CHECK: %9 = llvm.mlir.constant(1024 : index) : i64 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not your change, but I suggest regenerating these checks from scratch to get rid of the numbered SSA values in the checks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, I regenerated the checks from scratch for this function.
Thank you, Vijay! |
This change modifies the
AllocaOpConversion
in flang codegen to insert constant-sized LLVM allocas at the entry block ofLLVMFuncOp
or OpenACC/OpenMP Op, rather than in-place at thefir.alloca
. This effectively hoists constant-sized FIR allocas to the proper block.When compiling the example subroutine below with
flang-new
, we get a llvm.stacksave/stackrestore pair around a constant-sizedfir.alloca i32
.Without the proposed change, downstream LLVM compilation cannot hoist this constant-sized alloca out of the stacksave/stackrestore region which may lead to missed downstream optimizations:
With this change, the
llvm.alloca
is already hoisted out of the stacksave/stackrestore region during flang codegen: