Skip to content

Commit 175d297

Browse files
authored
[LoopUnroll] Add CSE to remove redundant loads after unrolling. (#83860)
This patch adds loadCSE support to simplifyLoopAfterUnroll. It is based on EarlyCSE's implementation using ScopedHashTable and uses SCEV expressions for the accessed pointers to find redundant loads after unrolling. This applies to the late unroll pass only; for full unrolling, those redundant loads will be cleaned up by the regular pipeline. The current approach constructs MSSA on-demand per-loop, but there is still a small but notable compile-time impact: stage1-O3 +0.04%, stage1-ReleaseThinLTO +0.06%, stage1-ReleaseLTO-g +0.05%, stage1-O0-g +0.02%, stage2-O3 +0.09%, stage2-O0-g +0.04%, stage2-clang +0.02%. See https://llvm-compile-time-tracker.com/compare.php?from=c089fa5a729e217d0c0d4647656386dac1a1b135&to=ec7c0f27cb5c12b600d9adfc8543d131765ec7be&stat=instructions:u for details. This benefits some workloads with runtime unrolling disabled, where users use pragmas to force unrolling, as well as workloads with runtime unrolling enabled. On SPEC/MultiSource, this removes a number of loads after unrolling on AArch64 with runtime unrolling enabled.
```
External/S...te/526.blender_r/526.blender_r 96
MultiSourc...rks/mediabench/gsm/toast/toast 39
SingleSource/Benchmarks/Misc/ffbench 4
External/SPEC/CINT2006/403.gcc/403.gcc 18
MultiSourc.../Applications/JM/ldecod/ldecod 4
MultiSourc.../mediabench/jpeg/jpeg-6a/cjpeg 6
MultiSourc...OE-ProxyApps-C/miniGMG/miniGMG 9
MultiSourc...e/Applications/ClamAV/clamscan 4
MultiSourc.../MallocBench/espresso/espresso 3
MultiSourc...dence-flt/LinearDependence-flt 2
MultiSourc...ch/office-ispell/office-ispell 4
MultiSourc...ch/consumer-jpeg/consumer-jpeg 6
MultiSourc...ench/security-sha/security-sha 11
MultiSourc...chmarks/McCat/04-bisect/bisect 3
SingleSour...tTests/2020-01-06-coverage-009 12
MultiSourc...ench/telecomm-gsm/telecomm-gsm 39
MultiSourc...lds-flt/CrossingThresholds-flt 24
MultiSourc...dence-dbl/LinearDependence-dbl 2
External/S...C/CINT2006/445.gobmk/445.gobmk 6
MultiSourc...enchmarks/mafft/pairlocalalign 53
External/S...31.deepsjeng_r/531.deepsjeng_r 3
External/S...rate/510.parest_r/510.parest_r 58
External/S...NT2006/464.h264ref/464.h264ref 29
External/S...NT2017rate/502.gcc_r/502.gcc_r 45
External/S...C/CINT2006/456.hmmer/456.hmmer 6
External/S...te/538.imagick_r/538.imagick_r 18
External/S.../CFP2006/447.dealII/447.dealII 4
MultiSourc...OE-ProxyApps-C++/miniFE/miniFE 12
External/S...2017rate/525.x264_r/525.x264_r 36
MultiSourc...Benchmarks/7zip/7zip-benchmark 33
MultiSourc...hmarks/ASC_Sequoia/AMGmk/AMGmk 2
MultiSourc...chmarks/VersaBench/8b10b/8b10b 1
MultiSourc.../Applications/JM/lencod/lencod 116
MultiSourc...lds-dbl/CrossingThresholds-dbl 24
MultiSource/Benchmarks/McCat/05-eks/eks 15
```
PR: #83860
1 parent b6328db commit 175d297

File tree

8 files changed

+257
-52
lines changed

8 files changed

+257
-52
lines changed

llvm/include/llvm/Analysis/MemorySSA.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ namespace llvm {
110110
template <class GraphType> struct GraphTraits;
111111
class BasicBlock;
112112
class Function;
113+
class Loop;
113114
class Instruction;
114115
class LLVMContext;
115116
class MemoryAccess;
@@ -700,6 +701,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
700701
class MemorySSA {
701702
public:
702703
MemorySSA(Function &, AliasAnalysis *, DominatorTree *);
704+
MemorySSA(Loop &, AliasAnalysis *, DominatorTree *);
703705

704706
// MemorySSA must remain where it's constructed; Walkers it creates store
705707
// pointers to it.
@@ -800,10 +802,11 @@ class MemorySSA {
800802
// Used by Memory SSA dumpers and wrapper pass
801803
friend class MemorySSAUpdater;
802804

805+
template <typename IterT>
803806
void verifyOrderingDominationAndDefUses(
804-
Function &F, VerificationLevel = VerificationLevel::Fast) const;
805-
void verifyDominationNumbers(const Function &F) const;
806-
void verifyPrevDefInPhis(Function &F) const;
807+
IterT Blocks, VerificationLevel = VerificationLevel::Fast) const;
808+
template <typename IterT> void verifyDominationNumbers(IterT Blocks) const;
809+
template <typename IterT> void verifyPrevDefInPhis(IterT Blocks) const;
807810

808811
// This is used by the use optimizer and updater.
809812
AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
@@ -847,7 +850,8 @@ class MemorySSA {
847850
class OptimizeUses;
848851

849852
CachingWalker *getWalkerImpl();
850-
void buildMemorySSA(BatchAAResults &BAA);
853+
template <typename IterT>
854+
void buildMemorySSA(BatchAAResults &BAA, IterT Blocks);
851855

852856
void prepareForMoveTo(MemoryAccess *, BasicBlock *);
853857
void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const;
@@ -871,7 +875,8 @@ class MemorySSA {
871875
void renumberBlock(const BasicBlock *) const;
872876
AliasAnalysis *AA = nullptr;
873877
DominatorTree *DT;
874-
Function &F;
878+
Function *F = nullptr;
879+
Loop *L = nullptr;
875880

876881
// Memory SSA mappings
877882
DenseMap<const Value *, MemoryAccess *> ValueToMemoryAccess;

llvm/include/llvm/Transforms/Utils/UnrollLoop.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
namespace llvm {
2323

2424
class AssumptionCache;
25+
class AAResults;
2526
class BasicBlock;
2627
class BlockFrequencyInfo;
2728
class DependenceInfo;
@@ -79,7 +80,8 @@ LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
7980
AssumptionCache *AC,
8081
const llvm::TargetTransformInfo *TTI,
8182
OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
82-
Loop **RemainderLoop = nullptr);
83+
Loop **RemainderLoop = nullptr,
84+
AAResults *AA = nullptr);
8385

8486
bool UnrollRuntimeLoopRemainder(
8587
Loop *L, unsigned Count, bool AllowExpensiveTripCount,
@@ -102,7 +104,8 @@ bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
102104
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
103105
ScalarEvolution *SE, DominatorTree *DT,
104106
AssumptionCache *AC,
105-
const TargetTransformInfo *TTI);
107+
const TargetTransformInfo *TTI,
108+
AAResults *AA = nullptr);
106109

107110
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
108111

llvm/lib/Analysis/MemorySSA.cpp

Lines changed: 81 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/Analysis/AliasAnalysis.h"
2626
#include "llvm/Analysis/CFGPrinter.h"
2727
#include "llvm/Analysis/IteratedDominanceFrontier.h"
28+
#include "llvm/Analysis/LoopInfo.h"
2829
#include "llvm/Analysis/MemoryLocation.h"
2930
#include "llvm/Config/llvm-config.h"
3031
#include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -1230,7 +1231,7 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
12301231
}
12311232

12321233
MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
1233-
: DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
1234+
: DT(DT), F(&Func), LiveOnEntryDef(nullptr), Walker(nullptr),
12341235
SkipWalker(nullptr) {
12351236
// Build MemorySSA using a batch alias analysis. This reuses the internal
12361237
// state that AA collects during an alias()/getModRefInfo() call. This is
@@ -1239,8 +1240,29 @@ MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
12391240
// make queries about all the instructions in the Function.
12401241
assert(AA && "No alias analysis?");
12411242
BatchAAResults BatchAA(*AA);
1242-
buildMemorySSA(BatchAA);
1243-
// Intentionally leave AA to nullptr while building so we don't accidently
1243+
buildMemorySSA(BatchAA, iterator_range(F->begin(), F->end()));
1244+
// Intentionally leave AA to nullptr while building so we don't accidentally
1245+
// use non-batch AliasAnalysis.
1246+
this->AA = AA;
1247+
// Also create the walker here.
1248+
getWalker();
1249+
}
1250+
1251+
MemorySSA::MemorySSA(Loop &L, AliasAnalysis *AA, DominatorTree *DT)
1252+
: DT(DT), L(&L), LiveOnEntryDef(nullptr), Walker(nullptr),
1253+
SkipWalker(nullptr) {
1254+
// Build MemorySSA using a batch alias analysis. This reuses the internal
1255+
// state that AA collects during an alias()/getModRefInfo() call. This is
1256+
// safe because there are no CFG changes while building MemorySSA and can
1257+
// significantly reduce the time spent by the compiler in AA, because we will
1258+
// make queries about all the instructions in the Loop.
1259+
assert(AA && "No alias analysis?");
1260+
BatchAAResults BatchAA(*AA);
1261+
buildMemorySSA(
1262+
BatchAA, map_range(L.blocks(), [](const BasicBlock *BB) -> BasicBlock & {
1263+
return *const_cast<BasicBlock *>(BB);
1264+
}));
1265+
// Intentionally leave AA to nullptr while building so we don't accidentally
12441266
// use non-batch AliasAnalysis.
12451267
this->AA = AA;
12461268
// Also create the walker here.
@@ -1493,24 +1515,25 @@ void MemorySSA::placePHINodes(
14931515
createMemoryPhi(BB);
14941516
}
14951517

1496-
void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
1518+
template <typename IterT>
1519+
void MemorySSA::buildMemorySSA(BatchAAResults &BAA, IterT Blocks) {
14971520
// We create an access to represent "live on entry", for things like
14981521
// arguments or users of globals, where the memory they use is defined before
14991522
// the beginning of the function. We do not actually insert it into the IR.
15001523
// We do not define a live on exit for the immediate uses, and thus our
15011524
// semantics do *not* imply that something with no immediate uses can simply
15021525
// be removed.
1503-
BasicBlock &StartingPoint = F.getEntryBlock();
1504-
LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr,
1505-
&StartingPoint, NextID++));
1526+
BasicBlock &StartingPoint = *Blocks.begin();
1527+
LiveOnEntryDef.reset(new MemoryDef(StartingPoint.getContext(), nullptr,
1528+
nullptr, &StartingPoint, NextID++));
15061529

15071530
// We maintain lists of memory accesses per-block, trading memory for time. We
15081531
// could just look up the memory access for every possible instruction in the
15091532
// stream.
15101533
SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
15111534
// Go through each block, figure out where defs occur, and chain together all
15121535
// the accesses.
1513-
for (BasicBlock &B : F) {
1536+
for (BasicBlock &B : Blocks) {
15141537
bool InsertIntoDef = false;
15151538
AccessList *Accesses = nullptr;
15161539
DefsList *Defs = nullptr;
@@ -1537,11 +1560,29 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
15371560
// Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
15381561
// filled in with all blocks.
15391562
SmallPtrSet<BasicBlock *, 16> Visited;
1540-
renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
1563+
if (L) {
1564+
// Only building MemorySSA for a single loop. placePHINodes may have
1565+
// inserted a MemoryPhi in the loop's preheader. As this is outside the
1566+
// scope of the loop, set them to LiveOnEntry.
1567+
if (auto *P = getMemoryAccess(L->getLoopPreheader())) {
1568+
for (Use &U : make_early_inc_range(P->uses()))
1569+
U.set(LiveOnEntryDef.get());
1570+
removeFromLists(P);
1571+
}
1572+
// Now rename accesses in the loop. Populate Visited with the exit blocks of
1573+
// the loop, to limit the scope of the renaming.
1574+
SmallVector<BasicBlock *> ExitBlocks;
1575+
L->getExitBlocks(ExitBlocks);
1576+
Visited.insert(ExitBlocks.begin(), ExitBlocks.end());
1577+
renamePass(DT->getNode(L->getLoopPreheader()), LiveOnEntryDef.get(),
1578+
Visited);
1579+
} else {
1580+
renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
1581+
}
15411582

15421583
// Mark the uses in unreachable blocks as live on entry, so that they go
15431584
// somewhere.
1544-
for (auto &BB : F)
1585+
for (auto &BB : Blocks)
15451586
if (!Visited.count(&BB))
15461587
markUnreachableAsLiveOnEntry(&BB);
15471588
}
@@ -1851,7 +1892,10 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
18511892

18521893
void MemorySSA::print(raw_ostream &OS) const {
18531894
MemorySSAAnnotatedWriter Writer(this);
1854-
F.print(OS, &Writer);
1895+
Function *F = this->F;
1896+
if (L)
1897+
F = L->getHeader()->getParent();
1898+
F->print(OS, &Writer);
18551899
}
18561900

18571901
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1864,10 +1908,23 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
18641908
#endif
18651909

18661910
#ifndef NDEBUG
1867-
verifyOrderingDominationAndDefUses(F, VL);
1868-
verifyDominationNumbers(F);
1869-
if (VL == VerificationLevel::Full)
1870-
verifyPrevDefInPhis(F);
1911+
if (F) {
1912+
auto Blocks = iterator_range(F->begin(), F->end());
1913+
verifyOrderingDominationAndDefUses(Blocks, VL);
1914+
verifyDominationNumbers(Blocks);
1915+
if (VL == VerificationLevel::Full)
1916+
verifyPrevDefInPhis(Blocks);
1917+
} else {
1918+
assert(L && "must either have loop or function");
1919+
auto Blocks =
1920+
map_range(L->blocks(), [](const BasicBlock *BB) -> BasicBlock & {
1921+
return *const_cast<BasicBlock *>(BB);
1922+
});
1923+
verifyOrderingDominationAndDefUses(Blocks, VL);
1924+
verifyDominationNumbers(Blocks);
1925+
if (VL == VerificationLevel::Full)
1926+
verifyPrevDefInPhis(Blocks);
1927+
}
18711928
#endif
18721929
// Previously, the verification used to also verify that the clobberingAccess
18731930
// cached by MemorySSA is the same as the clobberingAccess found at a later
@@ -1881,8 +1938,9 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
18811938
// example, see test4 added in D51960.
18821939
}
18831940

1884-
void MemorySSA::verifyPrevDefInPhis(Function &F) const {
1885-
for (const BasicBlock &BB : F) {
1941+
template <typename IterT>
1942+
void MemorySSA::verifyPrevDefInPhis(IterT Blocks) const {
1943+
for (const BasicBlock &BB : Blocks) {
18861944
if (MemoryPhi *Phi = getMemoryAccess(&BB)) {
18871945
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
18881946
auto *Pred = Phi->getIncomingBlock(I);
@@ -1917,12 +1975,13 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {
19171975

19181976
/// Verify that all of the blocks we believe to have valid domination numbers
19191977
/// actually have valid domination numbers.
1920-
void MemorySSA::verifyDominationNumbers(const Function &F) const {
1978+
template <typename IterT>
1979+
void MemorySSA::verifyDominationNumbers(IterT Blocks) const {
19211980
if (BlockNumberingValid.empty())
19221981
return;
19231982

19241983
SmallPtrSet<const BasicBlock *, 16> ValidBlocks = BlockNumberingValid;
1925-
for (const BasicBlock &BB : F) {
1984+
for (const BasicBlock &BB : Blocks) {
19261985
if (!ValidBlocks.count(&BB))
19271986
continue;
19281987

@@ -1958,14 +2017,15 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
19582017
/// Verify def-uses: the immediate use information - walk all the memory
19592018
/// accesses and verifying that, for each use, it appears in the appropriate
19602019
/// def's use list
1961-
void MemorySSA::verifyOrderingDominationAndDefUses(Function &F,
2020+
template <typename IterT>
2021+
void MemorySSA::verifyOrderingDominationAndDefUses(IterT Blocks,
19622022
VerificationLevel VL) const {
19632023
// Walk all the blocks, comparing what the lookups think and what the access
19642024
// lists think, as well as the order in the blocks vs the order in the access
19652025
// lists.
19662026
SmallVector<MemoryAccess *, 32> ActualAccesses;
19672027
SmallVector<MemoryAccess *, 32> ActualDefs;
1968-
for (BasicBlock &B : F) {
2028+
for (BasicBlock &B : Blocks) {
19692029
const AccessList *AL = getBlockAccesses(&B);
19702030
const auto *DL = getBlockDefs(&B);
19712031
MemoryPhi *Phi = getMemoryAccess(&B);

llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "llvm/ADT/DenseMapInfo.h"
1717
#include "llvm/ADT/DenseSet.h"
1818
#include "llvm/ADT/STLExtras.h"
19+
#include "llvm/ADT/ScopedHashTable.h"
1920
#include "llvm/ADT/SetVector.h"
2021
#include "llvm/ADT/SmallPtrSet.h"
2122
#include "llvm/ADT/SmallVector.h"
@@ -27,6 +28,7 @@
2728
#include "llvm/Analysis/LoopInfo.h"
2829
#include "llvm/Analysis/LoopPass.h"
2930
#include "llvm/Analysis/LoopUnrollAnalyzer.h"
31+
#include "llvm/Analysis/MemorySSA.h"
3032
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
3133
#include "llvm/Analysis/ProfileSummaryInfo.h"
3234
#include "llvm/Analysis/ScalarEvolution.h"
@@ -1140,7 +1142,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
11401142
std::optional<bool> ProvidedUpperBound,
11411143
std::optional<bool> ProvidedAllowPeeling,
11421144
std::optional<bool> ProvidedAllowProfileBasedPeeling,
1143-
std::optional<unsigned> ProvidedFullUnrollMaxCount) {
1145+
std::optional<unsigned> ProvidedFullUnrollMaxCount,
1146+
AAResults *AA = nullptr) {
11441147

11451148
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
11461149
<< L->getHeader()->getParent()->getName() << "] Loop %"
@@ -1292,7 +1295,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
12921295

12931296
ValueToValueMapTy VMap;
12941297
if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
1295-
simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
1298+
simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
12961299
// If the loop was peeled, we already "used up" the profile information
12971300
// we had, so we don't want to unroll or peel again.
12981301
if (PP.PeelProfiledIterations)
@@ -1325,7 +1328,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
13251328
L,
13261329
{UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
13271330
UP.UnrollRemainder, ForgetAllSCEV},
1328-
LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
1331+
LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
13291332
if (UnrollResult == LoopUnrollResult::Unmodified)
13301333
return LoopUnrollResult::Unmodified;
13311334

@@ -1572,6 +1575,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
15721575
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
15731576
auto &AC = AM.getResult<AssumptionAnalysis>(F);
15741577
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1578+
AAResults &AA = AM.getResult<AAManager>(F);
15751579

15761580
LoopAnalysisManager *LAM = nullptr;
15771581
if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
@@ -1627,7 +1631,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
16271631
/*Count*/ std::nullopt,
16281632
/*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
16291633
UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1630-
UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1634+
UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
1635+
&AA);
16311636
Changed |= Result != LoopUnrollResult::Unmodified;
16321637

16331638
// The parent must not be damaged by unrolling!

0 commit comments

Comments
 (0)