Skip to content

[LoopUnroll] Add CSE to remove redundant loads after unrolling. #83860

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions llvm/include/llvm/Analysis/MemorySSA.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ namespace llvm {
template <class GraphType> struct GraphTraits;
class BasicBlock;
class Function;
class Loop;
class Instruction;
class LLVMContext;
class MemoryAccess;
Expand Down Expand Up @@ -700,6 +701,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryPhi, MemoryAccess)
class MemorySSA {
public:
MemorySSA(Function &, AliasAnalysis *, DominatorTree *);
MemorySSA(Loop &, AliasAnalysis *, DominatorTree *);

// MemorySSA must remain where it's constructed; Walkers it creates store
// pointers to it.
Expand Down Expand Up @@ -800,10 +802,11 @@ class MemorySSA {
// Used by Memory SSA dumpers and wrapper pass
friend class MemorySSAUpdater;

template <typename IterT>
void verifyOrderingDominationAndDefUses(
Function &F, VerificationLevel = VerificationLevel::Fast) const;
void verifyDominationNumbers(const Function &F) const;
void verifyPrevDefInPhis(Function &F) const;
IterT Blocks, VerificationLevel = VerificationLevel::Fast) const;
template <typename IterT> void verifyDominationNumbers(IterT Blocks) const;
template <typename IterT> void verifyPrevDefInPhis(IterT Blocks) const;

// This is used by the use optimizer and updater.
AccessList *getWritableBlockAccesses(const BasicBlock *BB) const {
Expand Down Expand Up @@ -847,7 +850,8 @@ class MemorySSA {
class OptimizeUses;

CachingWalker *getWalkerImpl();
void buildMemorySSA(BatchAAResults &BAA);
template <typename IterT>
void buildMemorySSA(BatchAAResults &BAA, IterT Blocks);

void prepareForMoveTo(MemoryAccess *, BasicBlock *);
void verifyUseInDefs(MemoryAccess *, MemoryAccess *) const;
Expand All @@ -871,7 +875,8 @@ class MemorySSA {
void renumberBlock(const BasicBlock *) const;
AliasAnalysis *AA = nullptr;
DominatorTree *DT;
Function &F;
Function *F = nullptr;
Loop *L = nullptr;

// Memory SSA mappings
DenseMap<const Value *, MemoryAccess *> ValueToMemoryAccess;
Expand Down
7 changes: 5 additions & 2 deletions llvm/include/llvm/Transforms/Utils/UnrollLoop.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
namespace llvm {

class AssumptionCache;
class AAResults;
class BasicBlock;
class BlockFrequencyInfo;
class DependenceInfo;
Expand Down Expand Up @@ -79,7 +80,8 @@ LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
AssumptionCache *AC,
const llvm::TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE, bool PreserveLCSSA,
Loop **RemainderLoop = nullptr);
Loop **RemainderLoop = nullptr,
AAResults *AA = nullptr);

bool UnrollRuntimeLoopRemainder(
Loop *L, unsigned Count, bool AllowExpensiveTripCount,
Expand All @@ -102,7 +104,8 @@ bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC,
const TargetTransformInfo *TTI);
const TargetTransformInfo *TTI,
AAResults *AA = nullptr);

MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);

Expand Down
102 changes: 81 additions & 21 deletions llvm/lib/Analysis/MemorySSA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
Expand Down Expand Up @@ -1230,7 +1231,7 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
}

MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
: DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
: DT(DT), F(&Func), LiveOnEntryDef(nullptr), Walker(nullptr),
SkipWalker(nullptr) {
// Build MemorySSA using a batch alias analysis. This reuses the internal
// state that AA collects during an alias()/getModRefInfo() call. This is
Expand All @@ -1239,8 +1240,29 @@ MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
// make queries about all the instructions in the Function.
assert(AA && "No alias analysis?");
BatchAAResults BatchAA(*AA);
buildMemorySSA(BatchAA);
// Intentionally leave AA to nullptr while building so we don't accidently
buildMemorySSA(BatchAA, iterator_range(F->begin(), F->end()));
// Intentionally leave AA to nullptr while building so we don't accidentally
// use non-batch AliasAnalysis.
this->AA = AA;
// Also create the walker here.
getWalker();
}

MemorySSA::MemorySSA(Loop &L, AliasAnalysis *AA, DominatorTree *DT)
: DT(DT), L(&L), LiveOnEntryDef(nullptr), Walker(nullptr),
SkipWalker(nullptr) {
// Build MemorySSA using a batch alias analysis. This reuses the internal
// state that AA collects during an alias()/getModRefInfo() call. This is
// safe because there are no CFG changes while building MemorySSA and can
// significantly reduce the time spent by the compiler in AA, because we will
// make queries about all the instructions in the loop.
assert(AA && "No alias analysis?");
BatchAAResults BatchAA(*AA);
buildMemorySSA(
BatchAA, map_range(L.blocks(), [](const BasicBlock *BB) -> BasicBlock & {
return *const_cast<BasicBlock *>(BB);
}));
// Intentionally leave AA to nullptr while building so we don't accidentally
// use non-batch AliasAnalysis.
this->AA = AA;
// Also create the walker here.
Expand Down Expand Up @@ -1493,24 +1515,25 @@ void MemorySSA::placePHINodes(
createMemoryPhi(BB);
}

void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
template <typename IterT>
void MemorySSA::buildMemorySSA(BatchAAResults &BAA, IterT Blocks) {
// We create an access to represent "live on entry", for things like
// arguments or users of globals, where the memory they use is defined before
// the beginning of the function. We do not actually insert it into the IR.
// We do not define a live on exit for the immediate uses, and thus our
// semantics do *not* imply that something with no immediate uses can simply
// be removed.
BasicBlock &StartingPoint = F.getEntryBlock();
LiveOnEntryDef.reset(new MemoryDef(F.getContext(), nullptr, nullptr,
&StartingPoint, NextID++));
BasicBlock &StartingPoint = *Blocks.begin();
LiveOnEntryDef.reset(new MemoryDef(StartingPoint.getContext(), nullptr,
nullptr, &StartingPoint, NextID++));

// We maintain lists of memory accesses per-block, trading memory for time. We
// could just look up the memory access for every possible instruction in the
// stream.
SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
// Go through each block, figure out where defs occur, and chain together all
// the accesses.
for (BasicBlock &B : F) {
for (BasicBlock &B : Blocks) {
bool InsertIntoDef = false;
AccessList *Accesses = nullptr;
DefsList *Defs = nullptr;
Expand All @@ -1537,11 +1560,29 @@ void MemorySSA::buildMemorySSA(BatchAAResults &BAA) {
// Now do regular SSA renaming on the MemoryDef/MemoryUse. Visited will get
// filled in with all blocks.
SmallPtrSet<BasicBlock *, 16> Visited;
renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
if (L) {
// Only building MemorySSA for a single loop. placePHINodes may have
// inserted a MemoryPhi in the loop's preheader. As the preheader is outside
// the scope of the loop, redirect the phi's uses to LiveOnEntry and remove it.
if (auto *P = getMemoryAccess(L->getLoopPreheader())) {
for (Use &U : make_early_inc_range(P->uses()))
U.set(LiveOnEntryDef.get());
removeFromLists(P);
}
// Now rename accesses in the loop. Populate Visited with the exit blocks of
// the loop, to limit the scope of the renaming.
SmallVector<BasicBlock *> ExitBlocks;
L->getExitBlocks(ExitBlocks);
Visited.insert(ExitBlocks.begin(), ExitBlocks.end());
renamePass(DT->getNode(L->getLoopPreheader()), LiveOnEntryDef.get(),
Visited);
} else {
renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
}

// Mark the uses in unreachable blocks as live on entry, so that they go
// somewhere.
for (auto &BB : F)
for (auto &BB : Blocks)
if (!Visited.count(&BB))
markUnreachableAsLiveOnEntry(&BB);
}
Expand Down Expand Up @@ -1851,7 +1892,10 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {

void MemorySSA::print(raw_ostream &OS) const {
MemorySSAAnnotatedWriter Writer(this);
F.print(OS, &Writer);
Function *F = this->F;
if (L)
F = L->getHeader()->getParent();
F->print(OS, &Writer);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Expand All @@ -1864,10 +1908,23 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
#endif

#ifndef NDEBUG
verifyOrderingDominationAndDefUses(F, VL);
verifyDominationNumbers(F);
if (VL == VerificationLevel::Full)
verifyPrevDefInPhis(F);
if (F) {
auto Blocks = iterator_range(F->begin(), F->end());
verifyOrderingDominationAndDefUses(Blocks, VL);
verifyDominationNumbers(Blocks);
if (VL == VerificationLevel::Full)
verifyPrevDefInPhis(Blocks);
} else {
assert(L && "must either have loop or function");
auto Blocks =
map_range(L->blocks(), [](const BasicBlock *BB) -> BasicBlock & {
return *const_cast<BasicBlock *>(BB);
});
verifyOrderingDominationAndDefUses(Blocks, VL);
verifyDominationNumbers(Blocks);
if (VL == VerificationLevel::Full)
verifyPrevDefInPhis(Blocks);
}
#endif
// Previously, the verification used to also verify that the clobberingAccess
// cached by MemorySSA is the same as the clobberingAccess found at a later
Expand All @@ -1881,8 +1938,9 @@ void MemorySSA::verifyMemorySSA(VerificationLevel VL) const {
// example, see test4 added in D51960.
}

void MemorySSA::verifyPrevDefInPhis(Function &F) const {
for (const BasicBlock &BB : F) {
template <typename IterT>
void MemorySSA::verifyPrevDefInPhis(IterT Blocks) const {
for (const BasicBlock &BB : Blocks) {
if (MemoryPhi *Phi = getMemoryAccess(&BB)) {
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
auto *Pred = Phi->getIncomingBlock(I);
Expand Down Expand Up @@ -1917,12 +1975,13 @@ void MemorySSA::verifyPrevDefInPhis(Function &F) const {

/// Verify that all of the blocks we believe to have valid domination numbers
/// actually have valid domination numbers.
void MemorySSA::verifyDominationNumbers(const Function &F) const {
template <typename IterT>
void MemorySSA::verifyDominationNumbers(IterT Blocks) const {
if (BlockNumberingValid.empty())
return;

SmallPtrSet<const BasicBlock *, 16> ValidBlocks = BlockNumberingValid;
for (const BasicBlock &BB : F) {
for (const BasicBlock &BB : Blocks) {
if (!ValidBlocks.count(&BB))
continue;

Expand Down Expand Up @@ -1958,14 +2017,15 @@ void MemorySSA::verifyDominationNumbers(const Function &F) const {
/// Verify def-uses: the immediate use information - walk all the memory
/// accesses and verifying that, for each use, it appears in the appropriate
/// def's use list
void MemorySSA::verifyOrderingDominationAndDefUses(Function &F,
template <typename IterT>
void MemorySSA::verifyOrderingDominationAndDefUses(IterT Blocks,
VerificationLevel VL) const {
// Walk all the blocks, comparing what the lookups think and what the access
// lists think, as well as the order in the blocks vs the order in the access
// lists.
SmallVector<MemoryAccess *, 32> ActualAccesses;
SmallVector<MemoryAccess *, 32> ActualDefs;
for (BasicBlock &B : F) {
for (BasicBlock &B : Blocks) {
const AccessList *AL = getBlockAccesses(&B);
const auto *DL = getBlockDefs(&B);
MemoryPhi *Phi = getMemoryAccess(&B);
Expand Down
13 changes: 9 additions & 4 deletions llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
Expand All @@ -27,6 +28,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopUnrollAnalyzer.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
Expand Down Expand Up @@ -1140,7 +1142,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
std::optional<bool> ProvidedUpperBound,
std::optional<bool> ProvidedAllowPeeling,
std::optional<bool> ProvidedAllowProfileBasedPeeling,
std::optional<unsigned> ProvidedFullUnrollMaxCount) {
std::optional<unsigned> ProvidedFullUnrollMaxCount,
AAResults *AA = nullptr) {

LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
Expand Down Expand Up @@ -1292,7 +1295,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,

ValueToValueMapTy VMap;
if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
if (PP.PeelProfiledIterations)
Expand Down Expand Up @@ -1325,7 +1328,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
L,
{UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
UP.UnrollRemainder, ForgetAllSCEV},
LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
return LoopUnrollResult::Unmodified;

Expand Down Expand Up @@ -1572,6 +1575,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
AAResults &AA = AM.getResult<AAManager>(F);

LoopAnalysisManager *LAM = nullptr;
if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
Expand Down Expand Up @@ -1627,7 +1631,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
/*Count*/ std::nullopt,
/*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
&AA);
Changed |= Result != LoopUnrollResult::Unmodified;

// The parent must not be damaged by unrolling!
Expand Down
Loading