Skip to content

[X86,SimplifyCFG] Support hoisting load/store with conditional faulting (Part I) #96878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ struct SimplifyCFGOptions {
bool ConvertSwitchToLookupTable = false;
bool NeedCanonicalLoop = true;
bool HoistCommonInsts = false;
bool HoistLoadsStoresWithCondFaulting = false;
bool SinkCommonInsts = false;
bool SimplifyCondBranch = true;
bool SpeculateBlocks = true;
bool SpeculateUnpredictables = false;
bool RunInCodeGen = false;

AssumptionCache *AC = nullptr;

Expand Down Expand Up @@ -59,6 +61,10 @@ struct SimplifyCFGOptions {
HoistCommonInsts = B;
return *this;
}
SimplifyCFGOptions &hoistLoadsStoresWithCondFaulting(bool B) {
HoistLoadsStoresWithCondFaulting = B;
return *this;
}
SimplifyCFGOptions &sinkCommonInsts(bool B) {
SinkCommonInsts = B;
return *this;
Expand All @@ -80,6 +86,10 @@ struct SimplifyCFGOptions {
SpeculateUnpredictables = B;
return *this;
}
SimplifyCFGOptions &runInCodeGen(bool B) {
RunInCodeGen = B;
return *this;
}
};

} // namespace llvm
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/CodeGen/TargetPassConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "llvm/Target/CGPassBuilderOption.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Utils.h"
#include <cassert>
#include <optional>
Expand Down Expand Up @@ -852,6 +853,13 @@ void TargetPassConfig::addIRPasses() {
!DisableAtExitBasedGlobalDtorLowering)
addPass(createLowerGlobalDtorsLegacyPass());

// Hoist loads/stores to reduce branches when profitable.
if (getOptLevel() != CodeGenOptLevel::None)
addPass(createCFGSimplificationPass(
SimplifyCFGOptions()
.runInCodeGen(true)
.hoistLoadsStoresWithCondFaulting(true)));

// Make sure that no unreachable blocks are instruction selected.
addPass(createUnreachableBlockEliminationPass());

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Passes/PassBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,8 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
Result.needCanonicalLoops(Enable);
} else if (ParamName == "hoist-common-insts") {
Result.hoistCommonInsts(Enable);
} else if (ParamName == "hoist-loads-stores-with-cond-faulting") {
Result.hoistLoadsStoresWithCondFaulting(Enable);
} else if (ParamName == "sink-common-insts") {
Result.sinkCommonInsts(Enable);
} else if (ParamName == "speculate-unpredictables") {
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ static cl::opt<bool> UserHoistCommonInsts(
"hoist-common-insts", cl::Hidden, cl::init(false),
cl::desc("hoist common instructions (default = false)"));

static cl::opt<bool> UserHoistLoadsStoresWithCondFaulting(
"hoist-loads-stores-with-cond-faulting", cl::Hidden, cl::init(false),
cl::desc("Hoist loads/stores if the target supports conditional faulting "
"(default = false)"));

static cl::opt<bool> UserSinkCommonInsts(
"sink-common-insts", cl::Hidden, cl::init(false),
cl::desc("Sink common instructions (default = false)"));
Expand Down Expand Up @@ -272,6 +277,23 @@ static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
const SimplifyCFGOptions &Options) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);

// In codegen, we use unreachableblockelim to remove dead blocks b/c it
// doesn't really have any well defined semantics for unreachable code
// (see UnreachableBlockElim.cpp and LLVM::CodeGen/X86/GC/ocaml-gc-assert.ll
// for details).
//
// The first character is not capitalized to reduce the code diff FTTB.
auto removeUnreachableBlocks = [&](Function &F, DomTreeUpdater *DTU = nullptr,
MemorySSAUpdater *MSSAU = nullptr) {
return !Options.RunInCodeGen &&
llvm::removeUnreachableBlocks(F, DTU, MSSAU);
};

// Only run simplifycfg in codegen if we'd like to hoist loads/stores.
if (Options.RunInCodeGen && (!Options.HoistLoadsStoresWithCondFaulting ||
!TTI.hasConditionalLoadStoreForType()))
return false;

bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
EverChanged |=
tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr);
Expand Down Expand Up @@ -326,6 +348,9 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.NeedCanonicalLoop = UserKeepLoops;
if (UserHoistCommonInsts.getNumOccurrences())
Options.HoistCommonInsts = UserHoistCommonInsts;
if (UserHoistLoadsStoresWithCondFaulting.getNumOccurrences())
Options.HoistLoadsStoresWithCondFaulting =
UserHoistLoadsStoresWithCondFaulting;
if (UserSinkCommonInsts.getNumOccurrences())
Options.SinkCommonInsts = UserSinkCommonInsts;
if (UserSpeculateUnpredictables.getNumOccurrences())
Expand Down Expand Up @@ -354,6 +379,8 @@ void SimplifyCFGPass::printPipeline(
<< "switch-to-lookup;";
OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
OS << (Options.HoistLoadsStoresWithCondFaulting ? "" : "no-")
<< "hoist-loads-stores-with-cond-faulting;";
OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;";
OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;";
OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;";
Expand Down
143 changes: 134 additions & 9 deletions llvm/lib/Transforms/Utils/SimplifyCFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ static cl::opt<bool>
HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
cl::desc("Hoist common instructions up to the parent block"));

static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
"simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
cl::init(true),
cl::desc("Hoist loads/stores if the target supports "
"conditional faulting"));

static cl::opt<unsigned>
HoistCommonSkipLimit("simplifycfg-hoist-common-skip-limit", cl::Hidden,
cl::init(20),
Expand Down Expand Up @@ -2978,6 +2984,12 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
return BIEndProb < Likely;
}

static bool isSafeCheapLoadStore(const Instruction &I,
const TargetTransformInfo &TTI) {
return (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
TTI.hasConditionalLoadStoreForType(getLoadStoreType(&I));
}

/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
Expand Down Expand Up @@ -3052,6 +3064,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;

unsigned SpeculatedInstructions = 0;
bool HositLoadsStores = HoistLoadsStoresWithCondFaulting &&
Options.HoistLoadsStoresWithCondFaulting;
SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
Value *SpeculatedStoreValue = nullptr;
StoreInst *SpeculatedStore = nullptr;
EphemeralValueTracker EphTracker;
Expand Down Expand Up @@ -3080,22 +3095,31 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,

// Only speculatively execute a single instruction (not counting the
// terminator) for now.
++SpeculatedInstructions;
bool IsSafeCheapLoadStore =
HositLoadsStores && isSafeCheapLoadStore(I, TTI);
// Not count load/store into cost if target supports conditional faulting
// b/c it's cheap to speculate it.
if (IsSafeCheapLoadStore)
SpeculatedConditionalLoadsStores.push_back(&I);
else
++SpeculatedInstructions;

if (SpeculatedInstructions > 1)
return false;

// Don't hoist the instruction if it's unsafe or expensive.
if (!isSafeToSpeculativelyExecute(&I) &&
!(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
&I, BB, ThenBB, EndBB))))
if (!IsSafeCheapLoadStore && !isSafeToSpeculativelyExecute(&I) &&
!(HoistCondStores && !SpeculatedStoreValue &&
(SpeculatedStoreValue =
isSafeToSpeculateStore(&I, BB, ThenBB, EndBB))))
return false;
if (!SpeculatedStoreValue &&
if (!IsSafeCheapLoadStore && !SpeculatedStoreValue &&
computeSpeculationCost(&I, TTI) >
PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
return false;

// Store the store speculation candidate.
if (SpeculatedStoreValue)
if (!SpeculatedStore && SpeculatedStoreValue)
SpeculatedStore = cast<StoreInst>(&I);

// Do not hoist the instruction if any of its operands are defined but not
Expand All @@ -3122,11 +3146,11 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,

// Check that we can insert the selects and that it's not too expensive to do
// so.
bool Convert = SpeculatedStore != nullptr;
bool Convert =
SpeculatedStore != nullptr || !SpeculatedConditionalLoadsStores.empty();
InstructionCost Cost = 0;
Convert |= validateAndCostRequiredSelects(BB, ThenBB, EndBB,
SpeculatedInstructions,
Cost, TTI);
SpeculatedInstructions, Cost, TTI);
if (!Convert || Cost > Budget)
return false;

Expand Down Expand Up @@ -3214,6 +3238,107 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(),
std::prev(ThenBB->end()));

// If the target supports conditional faulting,
// we look for the following pattern:
// \code
// BB:
// ...
// %cond = icmp ult %x, %y
// br i1 %cond, label %TrueBB, label %FalseBB
// FalseBB:
// store i32 1, ptr %q, align 4
// ...
// TrueBB:
// %maskedloadstore = load i32, ptr %b, align 4
// store i32 %maskedloadstore, ptr %p, align 4
// ...
// \endcode
//
// and transform it into:
//
// \code
// BB:
// ...
// %cond = icmp ult %x, %y
// %maskedloadstore = cload i32, ptr %b, %cond
// cstore i32 %maskedloadstore, ptr %p, %cond
// cstore i32 1, ptr %q, ~%cond
// br i1 %cond, label %TrueBB, label %FalseBB
// FalseBB:
// ...
// TrueBB:
// ...
// \endcode
//
// where cload/cstore are represented by llvm.masked.load/store intrinsics,
// e.g.
//
// \code
// %vcond = bitcast i1 %cond to <1 x i1>
// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
// %maskedloadstore = bitcast <1 x i32> %v0 to i32
// call void @llvm.masked.store.v1i32.p0
// (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
// %cond.not = xor i1 %cond, true
// %vcond.not = bitcast i1 %cond.not to <1 x i>
// call void @llvm.masked.store.v1i32.p0
// (<1 x i32> <i32 1>, ptr %q, i32 4, <1x i1> %vcond.not)
// \endcode
//
// So we need to turn hoisted load/store into cload/cstore.
auto &Context = BI->getParent()->getContext();
auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
auto *Cond = BI->getOperand(0);
Value *Mask = nullptr;
// Construct the condition if needed.
if (!SpeculatedConditionalLoadsStores.empty()) {
IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back());
if (Invert)
Mask = Builder.CreateBitCast(
Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
else
Mask = Builder.CreateBitCast(Cond, VCondTy);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (Invert)
Mask = Builder.CreateBitCast(
Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
else
Mask = Builder.CreateBitCast(Cond, VCondTy);
Mask = Builder.CreateBitCast(Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond, VCondTy);

Is it possible to avoid creating a null pointer?

Copy link
Contributor Author

@KanRobert KanRobert Aug 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

The variable Mask is a loop invariant and we definitely want to define it outside the loop below. And the Mask instruction should be only created when the candidates for CLOAD/CSTORE is not empty. So we have to assign the value inside the if clause while declaring it outside the if clause.

Of course, we can keep Mask uninitialized at the declaration site w/o affecting the correctness b/c it's never used if the CLOAD/CSTORE optimization can not be done. But I think it's not worthy b/c it does not follow the best practice and nullptr initialization should not bring any cost indeed.

}
for (auto *I : SpeculatedConditionalLoadsStores) {
IRBuilder<> Builder(I);
// We currently assume conditional faulting load/store is supported for
// scalar types only when creating new instructions. This can be easily
// extended for vector types in the future.
assert(!getLoadStoreType(I)->isVectorTy() && "not implemented");
auto *Op0 = I->getOperand(0);
Instruction *MaskedLoadStore = nullptr;
if (auto *LI = dyn_cast<LoadInst>(I)) {
// Handle Load.
auto *Ty = I->getType();
MaskedLoadStore = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1),
Op0, LI->getAlign(), Mask);
I->replaceAllUsesWith(Builder.CreateBitCast(MaskedLoadStore, Ty));
} else {
// Handle Store.
auto *StoredVal =
Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
MaskedLoadStore = Builder.CreateMaskedStore(
StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
}
// For non-debug metadata, only !annotation, !range, !nonnull and !align are
// kept when hoisting (see Instruction::dropUBImplyingAttrsAndMetadata).
//
// !nonnull, !align : Not support pointer type, no need to keep.
// !range: load type is changed from scalar to vector, no idea how to keep
// it.
// !annotation: Not impact semantics, keep it.
I->dropUBImplyingAttrsAndUnknownMetadata({LLVMContext::MD_annotation});
// FIXME: DIAssignID is not supported for masked store yet.
// (Verifier::visitDIAssignIDMetadata)
at::deleteAssignmentMarkers(I);
I->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
return Node->getMetadataID() == Metadata::DIAssignIDKind;
});
MaskedLoadStore->copyMetadata(*I);
I->eraseFromParent();
}

// Insert selects and rewrite the PHI operands.
IRBuilder<NoFolder> Builder(BI);
for (PHINode &PN : EndBB->phis()) {
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AArch64/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,9 @@
; GCN-O1-NEXT: Lazy Branch Probability Analysis
; GCN-O1-NEXT: Lazy Block Frequency Analysis
; GCN-O1-NEXT: Expand memcmp() to load/stores
; GCN-O1-NEXT: Simplify the CFG
; GCN-O1-NEXT: Remove unreachable blocks from the CFG
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Branch Probability Analysis
Expand Down Expand Up @@ -500,7 +502,9 @@
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Expand memcmp() to load/stores
; GCN-O1-OPTS-NEXT: Simplify the CFG
; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Branch Probability Analysis
Expand Down Expand Up @@ -805,7 +809,9 @@
; GCN-O2-NEXT: Lazy Branch Probability Analysis
; GCN-O2-NEXT: Lazy Block Frequency Analysis
; GCN-O2-NEXT: Expand memcmp() to load/stores
; GCN-O2-NEXT: Simplify the CFG
; GCN-O2-NEXT: Remove unreachable blocks from the CFG
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Branch Probability Analysis
Expand Down Expand Up @@ -1118,7 +1124,9 @@
; GCN-O3-NEXT: Lazy Branch Probability Analysis
; GCN-O3-NEXT: Lazy Block Frequency Analysis
; GCN-O3-NEXT: Expand memcmp() to load/stores
; GCN-O3-NEXT: Simplify the CFG
; GCN-O3-NEXT: Remove unreachable blocks from the CFG
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Branch Probability Analysis
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/ARM/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/LoongArch/opt-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@
; LAXX-NEXT: Expand memcmp() to load/stores
; LAXX-NEXT: Lower Garbage Collection Instructions
; LAXX-NEXT: Shadow Stack GC Lowering
; LAXX-NEXT: Simplify the CFG
; LAXX-NEXT: Remove unreachable blocks from the CFG
; LAXX-NEXT: Dominator Tree Construction
; LAXX-NEXT: Natural Loop Information
; LAXX-NEXT: Post-Dominator Tree Construction
; LAXX-NEXT: Branch Probability Analysis
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/PowerPC/O3-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
Expand Down
Loading