[X86,SimplifyCFG] Support hoisting load/store with conditional faulting (Part II) #108812

Merged: 12 commits, Nov 25, 2024
89 changes: 73 additions & 16 deletions llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1661,21 +1661,43 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
/// \endcode
///
/// So we need to turn hoisted load/store into cload/cstore.
///
/// \param BI The branch instruction.
/// \param SpeculatedConditionalLoadsStores The load/store instructions that
/// will be speculated.
/// \param Invert indicates whether FalseBB is the speculated block; only set
/// (non-nullopt) for a triangle CFG.
static void hoistConditionalLoadsStores(
BranchInst *BI,
SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
bool Invert) {
std::optional<bool> Invert) {
Contributor:
Add a comment like "\param Invert ..."? It's a little hard to know when it's nullopt without searching for the caller.

Contributor Author:
Done.
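
For reference, here is a minimal standalone model of how the new std::optional<bool> Invert parameter is interpreted; the enum and function names are made up for illustration, and only the selection logic mirrors the patch (triangle CFG: one fixed mask, possibly inverted; diamond CFG: a per-instruction mask chosen by successor).

```cpp
#include <cassert>
#include <optional>

enum class Mask { True, False, Inverted, Plain };

// Triangle CFG: Invert has a value and fixes one mask for every hoisted
// instruction. Diamond CFG (new in this patch): Invert is nullopt and the
// mask depends on which successor the instruction came from.
Mask chooseMask(std::optional<bool> Invert, bool InTrueSuccessor) {
  if (Invert.has_value())
    return *Invert ? Mask::Inverted : Mask::Plain;
  return InTrueSuccessor ? Mask::True : Mask::False;
}

int main() {
  assert(chooseMask(true, /*InTrueSuccessor=*/false) == Mask::Inverted);
  assert(chooseMask(std::nullopt, /*InTrueSuccessor=*/true) == Mask::True);
  return 0;
}
```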

auto &Context = BI->getParent()->getContext();
auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
auto *Cond = BI->getOperand(0);
// Construct the condition if needed.
BasicBlock *BB = BI->getParent();
IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back());
Value *Mask = Builder.CreateBitCast(
Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
IRBuilder<> Builder(
Invert.has_value() ? SpeculatedConditionalLoadsStores.back() : BI);
Value *Mask = nullptr;
Value *MaskFalse = nullptr;
Value *MaskTrue = nullptr;
if (Invert.has_value()) {
Mask = Builder.CreateBitCast(
*Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
} else {
MaskFalse = Builder.CreateBitCast(
Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
MaskTrue = Builder.CreateBitCast(Cond, VCondTy);
}
auto PeekThroughBitcasts = [](Value *V) {
while (auto *BitCast = dyn_cast<BitCastInst>(V))
V = BitCast->getOperand(0);
return V;
};
for (auto *I : SpeculatedConditionalLoadsStores) {
IRBuilder<> Builder(I);
IRBuilder<> Builder(Invert.has_value() ? I : BI);
if (!Invert.has_value())
Mask = I->getParent() == BI->getSuccessor(0) ? MaskTrue : MaskFalse;
// We currently assume conditional faulting load/store is supported for
// scalar types only when creating new instructions. This can be easily
// extended for vector types in the future.
@@ -1687,12 +1709,14 @@ static void hoistConditionalLoadsStores(
auto *Ty = I->getType();
PHINode *PN = nullptr;
Value *PassThru = nullptr;
for (User *U : I->users())
if ((PN = dyn_cast<PHINode>(U))) {
PassThru = Builder.CreateBitCast(PN->getIncomingValueForBlock(BB),
FixedVectorType::get(Ty, 1));
break;
}
if (Invert.has_value())
for (User *U : I->users())
if ((PN = dyn_cast<PHINode>(U))) {
PassThru = Builder.CreateBitCast(
PeekThroughBitcasts(PN->getIncomingValueForBlock(BB)),
FixedVectorType::get(Ty, 1));
break;
}
MaskedLoadStore = Builder.CreateMaskedLoad(
FixedVectorType::get(Ty, 1), Op0, LI->getAlign(), Mask, PassThru);
Value *NewLoadStore = Builder.CreateBitCast(MaskedLoadStore, Ty);
Expand All @@ -1701,8 +1725,8 @@ static void hoistConditionalLoadsStores(
I->replaceAllUsesWith(NewLoadStore);
} else {
// Handle Store.
auto *StoredVal =
Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
auto *StoredVal = Builder.CreateBitCast(
PeekThroughBitcasts(Op0), FixedVectorType::get(Op0->getType(), 1));
MaskedLoadStore = Builder.CreateMaskedStore(
StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
}
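
As a side note, the masked-memory calls used above (CreateMaskedLoad/CreateMaskedStore guarded by a <1 x i1> mask) can be exercised in isolation. The sketch below is only an illustration of that IRBuilder API; the demo function, pointer type, and alignment are assumptions for the example, not code from the patch.

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("cload-demo", Ctx);
  IRBuilder<> B(Ctx);

  // i32 demo(ptr %p, i1 %cond)
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *I1 = Type::getInt1Ty(Ctx);
  auto *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);
  auto *FnTy = FunctionType::get(I32, {PtrTy, I1}, /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  // Bitcast the scalar i1 condition to a <1 x i1> mask, as the hoist does.
  auto *VCondTy = FixedVectorType::get(I1, 1);
  Value *Mask = B.CreateBitCast(F->getArg(1), VCondTy);

  // Conditional load: <1 x i32> masked load, then bitcast back to scalar i32.
  auto *V1I32 = FixedVectorType::get(I32, 1);
  Value *Loaded = B.CreateMaskedLoad(V1I32, F->getArg(0), Align(4), Mask,
                                     PoisonValue::get(V1I32));
  B.CreateRet(B.CreateBitCast(Loaded, I32));

  verifyModule(M, &errs());
  M.print(outs(), nullptr);
  return 0;
}
```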
@@ -3151,7 +3175,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}

static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
static bool isProfitableToSpeculate(const BranchInst *BI,
std::optional<bool> Invert,
const TargetTransformInfo &TTI) {
// If the branch is non-unpredictable, and is predicted to *not* branch to
// the `then` block, then avoid speculating it.
@@ -3162,7 +3187,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0)
return true;

uint64_t EndWeight = Invert ? TWeight : FWeight;
if (!Invert.has_value())
return false;

uint64_t EndWeight = *Invert ? TWeight : FWeight;
BranchProbability BIEndProb =
BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
BranchProbability Likely = TTI.getPredictableBranchThreshold();
@@ -7854,6 +7882,35 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (HoistCommon &&
hoistCommonCodeFromSuccessors(BI, !Options.HoistCommonInsts))
return requestResimplify();

if (BI && HoistLoadsStoresWithCondFaulting &&
Options.HoistLoadsStoresWithCondFaulting &&
isProfitableToSpeculate(BI, std::nullopt, TTI)) {
Contributor:
Hmm, from your code it seems the hoist can happen only when TWeight = FWeight = 0?

Contributor Author:
Yeah, we need a meaningful ratio here, but we haven't enabled PGO yet, so let's leave it for PGO tuning.
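
To make that observation concrete, here is a small standalone model of the profitability rule as this patch leaves it; the function name and parameters are illustrative, and only the control flow mirrors the diff above: for the two-successor (nullopt) case, speculation counts as profitable only when the branch is marked unpredictable or has no usable weights.

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

bool profitableModel(bool Unpredictable, bool HasWeights, uint64_t TWeight,
                     uint64_t FWeight, std::optional<bool> Invert) {
  if (Unpredictable)
    return true;                            // branch explicitly unpredictable
  if (!HasWeights || TWeight + FWeight == 0)
    return true;                            // no meaningful profile data
  if (!Invert.has_value())
    return false;                           // both-successor case bails out
  // Triangle case: compare the end-block weight against the predictable
  // branch threshold (elided in this model).
  return true;
}

int main() {
  // Well-predicted branch with both successors speculated: not profitable,
  // matching the observation that the hoist only fires without real weights.
  assert(!profitableModel(false, true, 90, 10, std::nullopt));
  assert(profitableModel(false, false, 0, 0, std::nullopt));
  return 0;
}
```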

SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
auto CanSpeculateConditionalLoadsStores = [&]() {
for (auto *Succ : successors(BB)) {
for (Instruction &I : *Succ) {
if (I.isTerminator()) {
if (I.getNumSuccessors() > 1)
return false;
Comment on lines +7894 to +7895

Contributor:
Add a test case for this?

Contributor Author:
Done.

continue;
} else if (!isSafeCheapLoadStore(&I, TTI) ||
SpeculatedConditionalLoadsStores.size() ==
HoistLoadsStoresWithCondFaultingThreshold) {
Comment on lines +7898 to +7899

Contributor:
I think we should consider branch probability for this, e.g. isProfitableToSpeculate. If A has two successors B and C, it's not profitable to execute more instructions to eliminate the branch when the branch is well-predicted and the load/store comes from the unlikely successor.

Contributor Author:
Good idea, done!

return false;
}
SpeculatedConditionalLoadsStores.push_back(&I);
}
}
return !SpeculatedConditionalLoadsStores.empty();
};

if (CanSpeculateConditionalLoadsStores()) {
Contributor:
It seems the lambda is used only once; maybe

bool CanSpeculateConditionalLoadsStores = <your lambda>();

looks better?

Contributor Author:
The advantage of using a lambda is that we can break out of the inner loop directly with return; without it we would need a goto or extra flags.
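
The pattern being defended is the immediately-invoked lambda as a multi-level break; a self-contained sketch (with made-up container and function names) is below.

```cpp
#include <cassert>
#include <vector>

// Returns true as soon as any nested value is negative.
bool anyNegative(const std::vector<std::vector<int>> &Blocks) {
  // Immediately-invoked lambda: a single `return` exits both loops at once,
  // where a plain nested loop would need a flag or a goto.
  return [&] {
    for (const auto &Block : Blocks)
      for (int V : Block)
        if (V < 0)
          return true;
    return false;
  }();
}

int main() {
  assert(anyNegative({{1, 2}, {3, -4}}));
  assert(!anyNegative({{1, 2}, {3, 4}}));
  return 0;
}
```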

hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
std::nullopt);
return requestResimplify();
}
}
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
64 changes: 50 additions & 14 deletions llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -276,34 +276,32 @@ if.false: ; preds = %if.true, %entry
}

;; Both of successor 0 and successor 1 have a single predecessor.
;; TODO: Support transform for this case.
define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
define i32 @single_predecessor(ptr %p, ptr %q, i32 %a) {
; CHECK-LABEL: @single_predecessor(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
; CHECK: common.ret:
; CHECK-NEXT: ret void
; CHECK: if.end:
; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
; CHECK-NEXT: br label [[COMMON_RET:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
; CHECK-NEXT: br label [[COMMON_RET]]
; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
; CHECK-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3
; CHECK-NEXT: ret i32 [[DOT]]
;
entry:
%tobool = icmp ne i32 %a, 0
br i1 %tobool, label %if.end, label %if.then

if.end:
store i32 1, ptr %q
ret void
ret i32 2

if.then:
%0 = load i32, ptr %q
store i32 %0, ptr %p
ret void
ret i32 3
}

;; Hoist 6 stores.
@@ -759,6 +757,44 @@ if.true:
ret i32 %res
}

;; Do not transform if either BB has multiple successors.
define i32 @not_multi_successors(i1 %c1, i32 %c2, ptr %p) {
; CHECK-LABEL: @not_multi_successors(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C1:%.*]], label [[ENTRY_IF:%.*]], label [[COMMON_RET:%.*]]
; CHECK: entry.if:
; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT: switch i32 [[C2:%.*]], label [[COMMON_RET]] [
; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
; CHECK-NEXT: i32 1, label [[SW_BB]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VAL]], [[ENTRY_IF]] ], [ 0, [[SW_BB]] ]
; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
; CHECK: sw.bb:
; CHECK-NEXT: br label [[COMMON_RET]]
;
entry:
br i1 %c1, label %entry.if, label %entry.else

entry.if: ; preds = %entry
%val = load i32, ptr %p, align 4
switch i32 %c2, label %return [
i32 0, label %sw.bb
i32 1, label %sw.bb
]

entry.else: ; preds = %entry
ret i32 0

sw.bb: ; preds = %entry.if, %entry.if
br label %return

return: ; preds = %sw.bb, %entry.if
%ret = phi i32 [ %val, %entry.if ], [ 0, %sw.bb ]
ret i32 %ret
}

declare i32 @read_memory_only() readonly nounwind willreturn speculatable

!llvm.dbg.cu = !{!0}