[X86,SimplifyCFG] Support hoisting load/store with conditional faulting (Part II) #108812
Conversation
@llvm/pr-subscribers-llvm-transforms
Author: Phoebe Wang (phoebewang)
Changes: This is a follow-up of #96878 to support hoisting load/store for a diamond CFG.
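For orientation, a minimal before/after sketch of the diamond shape this enables, condensed from the diamondCFG test added in the diff below (the authoritative expected output is in that test's CHECK lines):

;; Before: a diamond whose two arms are single stores.
define void @sketch_before(i32 %a, ptr %c, ptr %d) {
entry:
  %tobool.not = icmp eq i32 %a, 0
  br i1 %tobool.not, label %if.else, label %if.then
if.then:
  store i32 %a, ptr %c, align 4
  br label %if.end
if.else:
  store i32 0, ptr %d, align 4
  br label %if.end
if.end:
  ret void
}

;; After: the branch is gone; each store runs as a masked store guarded by
;; one of two complementary <1 x i1> masks derived from the condition.
define void @sketch_after(i32 %a, ptr %c, ptr %d) {
entry:
  %tobool.not = icmp eq i32 %a, 0
  %mask.true = bitcast i1 %tobool.not to <1 x i1>
  %cond.not = xor i1 %tobool.not, true
  %mask.false = bitcast i1 %cond.not to <1 x i1>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr %d, i32 4, <1 x i1> %mask.true)
  %a.v1 = bitcast i32 %a to <1 x i32>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %a.v1, ptr %c, i32 4, <1 x i1> %mask.false)
  ret void
}

declare void @llvm.masked.store.v1i32.p0(<1 x i32>, ptr, i32 immarg, <1 x i1>)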
Full diff: https://github.com/llvm/llvm-project/pull/108812.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f9db996cdc3583..420c59789813c1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -283,7 +283,7 @@ class SimplifyCFGOpt {
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
- bool hoistCommonCodeFromSuccessors(BasicBlock *BB, bool EqTermsOnly);
+ bool hoistCommonCodeFromSuccessors(Instruction *TI, bool EqTermsOnly);
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
@@ -1615,12 +1615,31 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
return false;
}
+static bool isSafeCheapLoadStore(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ // Not handle volatile or atomic.
+ if (auto *L = dyn_cast<LoadInst>(I)) {
+ if (!L->isSimple())
+ return false;
+ } else if (auto *S = dyn_cast<StoreInst>(I)) {
+ if (!S->isSimple())
+ return false;
+ } else
+ return false;
+
+ // llvm.masked.load/store use i32 for alignment while load/store use i64.
+ // That's why we have the alignment limitation.
+ // FIXME: Update the prototype of the intrinsics?
+ return TTI.hasConditionalLoadStoreForType(getLoadStoreType(I)) &&
+ getLoadStoreAlignment(I) < Value::MaximumAlignment;
+}
+
/// Hoist any common code in the successor blocks up into the block. This
/// function guarantees that BB dominates all successors. If EqTermsOnly is
/// given, only perform hoisting in case both blocks only contain a terminator.
/// In that case, only the original BI will be replaced and selects for PHIs are
/// added.
-bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
+bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
bool EqTermsOnly) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
@@ -1628,6 +1647,7 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
// such, we currently just scan for obviously identical instructions in an
// identical order, possibly separated by the same number of non-identical
// instructions.
+ BasicBlock *BB = TI->getParent();
unsigned int SuccSize = succ_size(BB);
if (SuccSize < 2)
return false;
@@ -1639,7 +1659,63 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(BasicBlock *BB,
if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
return false;
- auto *TI = BB->getTerminator();
+ auto *BI = dyn_cast<BranchInst>(TI);
+ if (BI && HoistLoadsStoresWithCondFaulting &&
+ Options.HoistLoadsStoresWithCondFaulting) {
+ SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
+ for (auto *Succ : successors(BB)) {
+ for (Instruction &I : drop_end(*Succ)) {
+ if (!isSafeCheapLoadStore(&I, TTI) ||
+ SpeculatedConditionalLoadsStores.size() ==
+ HoistLoadsStoresWithCondFaultingThreshold)
+ return false;
+ SpeculatedConditionalLoadsStores.push_back(&I);
+ }
+ }
+
+ // TODO: Move below code to a function to share with #96878.
+ if (SpeculatedConditionalLoadsStores.empty())
+ return false;
+
+ auto &Context = BI->getParent()->getContext();
+ auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ auto *Cond = BI->getOperand(0);
+ IRBuilder<> Builder(BI);
+ Value *Mask1 = Builder.CreateBitCast(Cond, VCondTy);
+ Value *Mask0 = Builder.CreateBitCast(
+ Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+ for (auto *I : SpeculatedConditionalLoadsStores) {
+ Value *Mask = I->getParent() == BI->getSuccessor(0) ? Mask1 : Mask0;
+ assert(!getLoadStoreType(I)->isVectorTy() && "not implemented");
+ auto *Op0 = I->getOperand(0);
+ Instruction *MaskedLoadStore = nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Handle Load.
+ auto *Ty = I->getType();
+ MaskedLoadStore = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1),
+ Op0, LI->getAlign(), Mask);
+ I->replaceAllUsesWith(Builder.CreateBitCast(MaskedLoadStore, Ty));
+ } else {
+ // Handle Store.
+ auto *StoredVal =
+ Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+ MaskedLoadStore = Builder.CreateMaskedStore(
+ StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+ }
+ I->dropUBImplyingAttrsAndUnknownMetadata(
+ {LLVMContext::MD_range, LLVMContext::MD_annotation});
+ // FIXME: DIAssignID is not supported for masked store yet.
+ // (Verifier::visitDIAssignIDMetadata)
+ at::deleteAssignmentMarkers(I);
+ I->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
+ return Node->getMetadataID() == Metadata::DIAssignIDKind;
+ });
+ MaskedLoadStore->copyMetadata(*I);
+ I->eraseFromParent();
+ }
+
+ return true;
+ }
// The second of pair is a SkipFlags bitmask.
using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
@@ -2998,25 +3074,6 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
return BIEndProb < Likely;
}
-static bool isSafeCheapLoadStore(const Instruction *I,
- const TargetTransformInfo &TTI) {
- // Not handle volatile or atomic.
- if (auto *L = dyn_cast<LoadInst>(I)) {
- if (!L->isSimple())
- return false;
- } else if (auto *S = dyn_cast<StoreInst>(I)) {
- if (!S->isSimple())
- return false;
- } else
- return false;
-
- // llvm.masked.load/store use i32 for alignment while load/store use i64.
- // That's why we have the alignment limitation.
- // FIXME: Update the prototype of the intrinsics?
- return TTI.hasConditionalLoadStoreForType(getLoadStoreType(I)) &&
- getLoadStoreAlignment(I) < Value::MaximumAlignment;
-}
-
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -7436,7 +7493,7 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
return requestResimplify();
if (HoistCommon &&
- hoistCommonCodeFromSuccessors(SI->getParent(), !Options.HoistCommonInsts))
+ hoistCommonCodeFromSuccessors(SI, !Options.HoistCommonInsts))
return requestResimplify();
return false;
@@ -7794,8 +7851,8 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
- if (HoistCommon && hoistCommonCodeFromSuccessors(
- BI->getParent(), !Options.HoistCommonInsts))
+ if (HoistCommon &&
+ hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 047ca717da8009..87ff4ba2af9a41 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -278,21 +278,19 @@ if.false: ; preds = %if.true, %entry
}
;; Both of successor 0 and successor 1 have a single predecessor.
-;; TODO: Support transform for this case.
define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
; CHECK-LABEL: @single_predecessor(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; CHECK: common.ret:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
; CHECK-NEXT: ret void
-; CHECK: if.end:
-; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT: br label [[COMMON_RET]]
;
entry:
%tobool = icmp ne i32 %a, 0
@@ -674,6 +672,34 @@ if.false:
ret void
}
+define void @diamondCFG(i32 %a, ptr %c, ptr %d) {
+; CHECK-LABEL: @diamondCFG(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL_NOT]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL_NOT]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[D:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[A]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[C:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %tobool.not = icmp eq i32 %a, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ store i32 %a, ptr %c, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ store i32 0, ptr %d, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
declare i32 @read_memory_only() readonly nounwind willreturn speculatable
!llvm.dbg.cu = !{!0}
@@ -728,6 +726,34 @@ if.true:
  ret i32 %res
}

define void @diamondCFG(i32 %a, ptr %c, ptr %d) {
Probably remove this? This function tests the same thing as single_predecessor
Done.
; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
We can use TMP3 here. The two bitcasts look redundant.
Done.
        *Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
        VCondTy);
  } else {
    Mask0 = Builder.CreateBitCast(
Considering we name the BBs TrueBB and FalseBB in the comment, maybe rename Mask0 -> MaskFalse and Mask1 -> MaskTrue?
Done.
 for (auto *I : SpeculatedConditionalLoadsStores) {
-  IRBuilder<> Builder(I);
+  IRBuilder<> Builder(Invert ? I : BI);
   if (!Invert)
if (!Mask) looks better?
Done.
Oh, we cannot. The Mask is not a nullptr in the second iteration.
Value *Mask = Builder.CreateBitCast(
    Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
    VCondTy);
IRBuilder<> Builder(Invert ? SpeculatedConditionalLoadsStores.back() : BI);
It seems we don't need to select the insertion point by condition. Always use BI? Same for line 1686.
No, we cannot. In the triangle case, SpeculatedConditionalLoadsStores is pushed in reverse order, so we must take the insertion point one by one; otherwise we get something like:
define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) #0 {
entry:
%0 = bitcast i1 %cond to <1 x i1>
%1 = bitcast i64 %5 to <1 x i64>
call void @llvm.masked.store.v1i64.p0(<1 x i64> %1, ptr %q, i32 8, <1 x i1> %0)
%2 = bitcast i32 %7 to <1 x i32>
call void @llvm.masked.store.v1i32.p0(<1 x i32> %2, ptr %p, i32 4, <1 x i1> %0)
%3 = bitcast i16 %9 to <1 x i16>
call void @llvm.masked.store.v1i16.p0(<1 x i16> %3, ptr %b, i32 2, <1 x i1> %0)
%4 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %b, i32 8, <1 x i1> %0, <1 x i64> poison)
%5 = bitcast <1 x i64> %4 to i64
%6 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %0, <1 x i32> poison)
%7 = bitcast <1 x i32> %6 to i32
%8 = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %p, i32 2, <1 x i1> %0, <1 x i16> poison)
%9 = bitcast <1 x i16> %8 to i16
ret void
}
Given it's an optional<bool>, would it be easier to grok if you used Invert.has_value()?
Done.
  }
}

if (!SpeculatedConditionalLoadsStores.empty())
Looks suspicious here, since the following checks are skipped. We should add tests to check what happens if there are non-load/store instructions in the successors.
The assumption here is that prior passes have moved common instructions out of the branches. It works in a pipeline, e.g.:
$ cat single_predecessor.ll
define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) {
entry:
%tobool = icmp ne i32 %x, 0
br i1 %tobool, label %if.end, label %if.then
if.end:
store i32 1, ptr %q
%c = add i32 %a, %b ; <== common instruction
ret i32 %c
if.then:
%0 = load i32, ptr %q
store i32 %0, ptr %p
%d = add i32 %a, %b ; <== common instruction
ret i32 %d
}
$ opt -passes=simplifycfg,'instcombine<no-verify-fixpoint>','simplifycfg<hoist-loads-stores-with-cond-faulting>' -mtriple=x86_64 -mattr=+cf single_predecessor.ll -S -o -
; ModuleID = 'single_predecessor.ll'
source_filename = "single_predecessor.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64"
define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) #0 {
entry:
%tobool.not = icmp eq i32 %x, 0
%0 = xor i1 %tobool.not, true
%1 = bitcast i1 %0 to <1 x i1>
%2 = bitcast i1 %tobool.not to <1 x i1>
%3 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %2, <1 x i32> poison)
%4 = bitcast <1 x i32> %3 to i32
call void @llvm.masked.store.v1i32.p0(<1 x i32> %3, ptr %p, i32 4, <1 x i1> %2)
call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %1)
%common.ret.op = add i32 %a, %b
ret i32 %common.ret.op
}
Considering the current implementation separates the common-instruction hoist from the load/store hoist, we should probably move the code to:
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
if (HoistLoadsStoresWithCondFaulting &&
Options.HoistLoadsStoresWithCondFaulting && hoistConditionalLoadsStores(...))
return requestResimplify();
I don't think it's necessary, given that the last simplifycfg is supposed to clean up single-entry-single-exit or empty blocks:
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Passes/PassBuilderPipelines.cpp#L1533-L1534
If we had such patterns, they would have been optimized by earlier simplifycfg runs:
$ clang -S apx.c -mapxf -mllvm -print-pipeline-passes -O1 | sed -e 's/,/\n/g' | sed -e 's/;no-[^;>]*//g' | grep ^simplifycfg
simplifycfg<bonus-inst-threshold=1;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>)
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;hoist-loads-stores-with-cond-faulting;speculate-blocks;simplify-cond-branch;speculate-unpredictables>)
I didn't mean we should clean up the empty BB ourselves. I meant that the logic of hoisting load/store does not match the name and comment of hoistCommonCodeFromSuccessors. We should probably move it out.
Makes sense, done.
if (!SpeculatedConditionalLoadsStores.empty())
  hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
                              std::nullopt);
Should we return here?
Yes, good catch!
@KanRobert, with the latest change, the icount and cond_br numbers for all CPU2017 benchmarks are shown below:
Do the values seem reasonable? There are no failures in either case.
✅ With the latest revision this PR passed the C/C++ code formatter.
E2E tests:
if (I.getNumSuccessors() > 1)
  return false;
Add a test case for this?
Done.
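A hedged sketch of the shape such a test needs (illustrative only; the test actually added is quoted later in the thread as multi_successors): a successor block whose terminator has more than one successor, e.g. a switch, so the hoist must bail out.

define i32 @not_single_successor(i1 %c1, i32 %c2, ptr %p) {
entry:
  br i1 %c1, label %if.then, label %if.end
if.then:
  store i32 1, ptr %p, align 4
  ; Terminator with two successors: getNumSuccessors() > 1, so no hoisting.
  switch i32 %c2, label %if.end [
    i32 0, label %other
  ]
other:
  br label %if.end
if.end:
  ret i32 0
}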
SpeculatedConditionalLoadsStores.size() ==
    HoistLoadsStoresWithCondFaultingThreshold) {
I think we should consider branch probability for this, e.g. isProfitableToSpeculate. If A has two successors B and C, it's not profitable to execute more instructions to eliminate the branch if the branch is well-predicted and the load/store comes from the unlikely successor.
Good idea, done!
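To make the profitability point concrete, a hedged sketch (the weights are made up): with heavily biased branch_weights the branch predicts well, so speculating the cold arm's store only adds executed instructions, and the isProfitableToSpeculate check should reject the hoist.

define void @well_predicted(i1 %cond, i32 %v, ptr %p) {
entry:
  br i1 %cond, label %cold, label %if.end, !prof !0
cold:                                             ; taken ~0.1% of the time
  store i32 %v, ptr %p, align 4
  br label %if.end
if.end:
  ret void
}
!0 = !{!"branch_weights", i32 1, i32 1000}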
@@ -1725,6 +1744,7 @@ static void hoistConditionalLoadsStores(
   MaskedLoadStore->copyMetadata(*I);
   I->eraseFromParent();
 }
 return true;
I think this function always returns true; should we change it to void?
Done.
Need to update the description; it does not require a diamond CFG.
Done.
icount and cond_br of all CPU2017 benchmarks with the latest change.
For cases like this, why aren't we folding to:
or the even more general:
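The two snippets this comment points at did not survive extraction. Judging from the follow-up replies about the store+select pattern, they were presumably along these lines (hypothetical reconstruction, not the commenter's exact code):

define void @fold_sketch(i1 %cond, i32 %a, ptr %p) {
entry:
  ; Fold "if (cond) *p = a" by selecting the stored value. The extra
  ; load of *p is where the side-effect question below comes from.
  %old = load i32, ptr %p, align 4
  %val = select i1 %cond, i32 %a, i32 %old
  store i32 %val, ptr %p, align 4
  ret void
}

define void @fold_sketch_general(i1 %cond, i32 %a, ptr %p) {
entry:
  ; A more general variant selects the destination instead, writing to a
  ; dead scratch slot when the condition is false.
  %scratch = alloca i32, align 4
  %dst = select i1 %cond, ptr %p, ptr %scratch
  store i32 %a, ptr %dst, align 4
  ret void
}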
That's a good question. It doesn't rely on a hardware feature, so it could be a general branch-to-select transformation. I'm also wondering why we didn't do it before, especially the first one. Is there any concern about possible side effects of the memory operand?
Seems no compiler chooses to do so: https://godbolt.org/z/7ahEcd6n8
if (BI && HoistLoadsStoresWithCondFaulting &&
    Options.HoistLoadsStoresWithCondFaulting &&
    isProfitableToSpeculate(BI, std::nullopt, TTI)) {
Hmm, from your code, it seems the hoist can happen only when TWeight == FWeight == 0?
Yeah, we need a meaningful ratio here, but we haven't enabled PGO, so let's leave it until we do PGO tuning.
From this perspective, should we implement this optimization without any HW feature and do further optimization with CFCMOV in the backend?
It seems to be easier to create the masked/conditional store pattern from the store+select pattern rather than the other way around.
Actually, we already have this optimization in SimplifyCFG; we just need to increase NumPHIInsts from 1 to 2.
Good knowledge! Then it seems this optimization for this case w/ CFCMOV does not have any value...
No, they are different optimizations; they just intersect on this specific example. The existing one does nothing with the test.
Yes, we can do that. I'll create one then :)
Friendly ping

Friendly ping
@@ -759,6 +757,43 @@ if.true:
  ret i32 %res
}

define i32 @multi_successors(i1 %c1, i32 %c2, ptr %p) {
Is this a negative test? If so, the function name should start with "not" and it needs a comment, e.g. "not_maximum_alignment" in this file.
Done.
  return !SpeculatedConditionalLoadsStores.empty();
};

if (CanSpeculateConditionalLoadsStores()) {
It seems the lambda is used only once; maybe bool CanSpeculateConditionalLoadsStores = <your lambda>(); looks better?
The advantage of using a lambda is that we can directly break out of the inner loop with a return. We would have to use goto or extra flags with non-lambda code.
@@ -1664,18 +1664,35 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
 static void hoistConditionalLoadsStores(
     BranchInst *BI,
     SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
-    bool Invert) {
+    std::optional<bool> Invert) {
Add a comment like \param Invert ...? It's a little hard to know when it's nullopt w/o searching for the caller.
Done.
This is a follow-up of #96878 to support hoisting load/store from BBs that have the same predecessor, when the loads/stores are the only instructions in the blocks and the branch is unpredictable, e.g. the diamond pattern in the diamondCFG test above.