[MachineSink] Extend loop sinking capability #117247

Merged: 12 commits, Jan 24, 2025
267 changes: 176 additions & 91 deletions llvm/lib/CodeGen/MachineSink.cpp
@@ -45,6 +45,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -113,6 +114,8 @@ STATISTIC(NumSplit, "Number of critical edges split");
STATISTIC(NumCoalesces, "Number of copies coalesced");
STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");

using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

namespace {

class MachineSinking : public MachineFunctionPass {
@@ -128,6 +131,7 @@ class MachineSinking : public MachineFunctionPass {
const MachineBranchProbabilityInfo *MBPI = nullptr;
AliasAnalysis *AA = nullptr;
RegisterClassInfo RegClassInfo;
TargetSchedModel SchedModel;

// Remember which edges have been considered for breaking.
SmallSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>, 8>
@@ -161,6 +165,8 @@ class MachineSinking : public MachineFunctionPass {
/// would re-order assignments.
using SeenDbgUser = PointerIntPair<MachineInstr *, 1>;

using SinkItem = std::pair<MachineInstr *, MachineBasicBlock *>;

/// Record of DBG_VALUE uses of vregs in a block, so that we can identify
/// debug instructions to sink.
SmallDenseMap<unsigned, TinyPtrVector<SeenDbgUser>> SeenDbgUsers;
@@ -255,7 +261,10 @@ class MachineSinking : public MachineFunctionPass {

void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
SmallVectorImpl<MachineInstr *> &Candidates);
bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);

bool
aggressivelySinkIntoCycle(MachineCycle *Cycle, MachineInstr &I,
DenseMap<SinkItem, MachineInstr *> &SunkInstrs);

bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -271,11 +280,14 @@
GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
AllSuccsCache &AllSuccessors) const;

std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB);
std::vector<unsigned> &getBBRegisterPressure(const MachineBasicBlock &MBB,
bool UseCache = true);

bool registerPressureSetExceedsLimit(unsigned NRegs,
const TargetRegisterClass *RC,
const MachineBasicBlock &MBB);

bool registerPressureExceedsLimit(const MachineBasicBlock &MBB);
};

} // end anonymous namespace
@@ -680,6 +692,10 @@ void MachineSinking::FindCycleSinkCandidates(
SmallVectorImpl<MachineInstr *> &Candidates) {
for (auto &MI : *BB) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
if (MI.isMetaInstruction()) {
LLVM_DEBUG(dbgs() << "CycleSink: not sinking meta instruction\n");
continue;
}
if (!TII->shouldSink(MI)) {
LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
"target\n");
@@ -775,31 +791,62 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {

if (SinkInstsIntoCycle) {
SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_cycles());
for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
continue;
}
SmallVector<MachineInstr *, 8> Candidates;
FindCycleSinkCandidates(Cycle, Preheader, Candidates);

// Walk the candidates in reverse order so that we start with the use
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
unsigned i = 0;
for (MachineInstr *I : llvm::reverse(Candidates)) {
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
"be analysed.");
break;
SchedModel.init(STI);
bool HasHighPressure;

DenseMap<SinkItem, MachineInstr *> SunkInstrs;

enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };
for (unsigned Stage = CycleSinkStage::COPY; Stage != CycleSinkStage::END;
++Stage, SunkInstrs.clear()) {
HasHighPressure = false;

for (auto *Cycle : Cycles) {
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
if (!Preheader) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
continue;
}
SmallVector<MachineInstr *, 8> Candidates;
FindCycleSinkCandidates(Cycle, Preheader, Candidates);

unsigned i = 0;

// Walk the candidates in reverse order so that we start with the use
// of a def-use chain, if there is any.
// TODO: Sort the candidates using a cost-model.
for (MachineInstr *I : llvm::reverse(Candidates)) {
// CycleSinkStage::COPY: Sink a limited number of copies
if (Stage == CycleSinkStage::COPY) {
if (i++ == SinkIntoCycleLimit) {
LLVM_DEBUG(dbgs()
<< "CycleSink: Limit reached of instructions to "
"be analyzed.");
break;
}

if (!I->isCopy())
continue;
}

if (!SinkIntoCycle(Cycle, *I))
break;
EverMadeChange = true;
++NumCycleSunk;
// CycleSinkStage::LOW_LATENCY: sink unlimited number of instructions
// which the target specifies as low-latency
if (Stage == CycleSinkStage::LOW_LATENCY &&
!TII->hasLowDefLatency(SchedModel, *I, 0))
continue;

if (!aggressivelySinkIntoCycle(Cycle, *I, SunkInstrs))
break;
EverMadeChange = true;
++NumCycleSunk;
}

// Recalculate the pressure after sinking
if (!HasHighPressure)
HasHighPressure = registerPressureExceedsLimit(*Preheader);
}
if (!HasHighPressure)
break;
}
}
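
For readers skimming the control flow: a minimal standalone sketch of the staged driver above, with plain STL stand-ins for the LLVM types. Block, sinkStage, and the pressure numbers are hypothetical; only the shape of the loop (escalate through the stages, stop once no preheader exceeds its limit) mirrors the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

enum CycleSinkStage { COPY, LOW_LATENCY, AGGRESSIVE, END };

struct Block {
  unsigned Pressure;      // current max pressure in this preheader
  unsigned PressureLimit; // target-specific pressure-set limit
};

// Pretend each stage sinks more instructions and relieves more pressure.
static void sinkStage(Block &B, unsigned Stage) {
  B.Pressure -= std::min(B.Pressure, 4 * (Stage + 1));
}

int main() {
  std::vector<Block> Preheaders = {{40, 32}, {30, 32}};
  for (unsigned Stage = COPY; Stage != END; ++Stage) {
    bool HasHighPressure = false;
    for (Block &B : Preheaders) {
      sinkStage(B, Stage);
      // Recalculate the pressure after sinking, as the patch does via
      // registerPressureExceedsLimit(*Preheader).
      if (B.Pressure >= B.PressureLimit)
        HasHighPressure = true;
    }
    if (!HasHighPressure)
      break; // every preheader fits: skip the more aggressive stages
    std::printf("stage %u left high pressure, escalating\n", Stage);
  }
  return 0;
}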

@@ -1055,13 +1102,15 @@ bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
}

std::vector<unsigned> &
MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB,
bool UseCache) {
// Currently, to save compile time, MBB's register pressure will not change
// in one ProcessBlock iteration because of CachedRegisterPressure, but MBB's
// register pressure is changed after sinking any instructions into it.
// FIXME: need an accurate and cheap register pressure estimate model here.

auto RP = CachedRegisterPressure.find(&MBB);
if (RP != CachedRegisterPressure.end())
if (UseCache && RP != CachedRegisterPressure.end())
return RP->second;

RegionPressure Pressure;
@@ -1085,6 +1134,12 @@ MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) {
}

RPTracker.closeRegion();

if (RP != CachedRegisterPressure.end()) {
CachedRegisterPressure[&MBB] = RPTracker.getPressure().MaxSetPressure;
return CachedRegisterPressure[&MBB];
}

auto It = CachedRegisterPressure.insert(
std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
return It.first->second;
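
The hunk above adds a refresh path: when the caller bypasses the cache (UseCache == false) but an entry already exists, the stale vector is overwritten in place. A minimal standalone sketch of that pattern, with std::map and int standing in for CachedRegisterPressure and MachineBasicBlock; computePressure is hypothetical.

#include <map>
#include <utility>
#include <vector>

using PressureVec = std::vector<unsigned>;

// Stand-in for the RegionPressure computation done by RegPressureTracker.
static PressureVec computePressure(int MBB) {
  return PressureVec(4, static_cast<unsigned>(MBB));
}

PressureVec &getPressure(std::map<int, PressureVec> &Cache, int MBB,
                         bool UseCache = true) {
  auto RP = Cache.find(MBB);
  if (UseCache && RP != Cache.end())
    return RP->second; // fast path: reuse the cached result

  PressureVec Fresh = computePressure(MBB);
  if (RP != Cache.end()) { // cache bypassed but entry exists: refresh it
    RP->second = std::move(Fresh);
    return RP->second;
  }
  auto It = Cache.emplace(MBB, std::move(Fresh)); // first computation
  return It.first->second;
}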
@@ -1103,6 +1158,21 @@ bool MachineSinking::registerPressureSetExceedsLimit(
return false;
}

// Recalculate register pressure and check if any pressure set exceeds its
// limit.
bool MachineSinking::registerPressureExceedsLimit(
const MachineBasicBlock &MBB) {
std::vector<unsigned> BBRegisterPressure = getBBRegisterPressure(MBB, false);

for (unsigned PS = 0; PS < BBRegisterPressure.size(); ++PS) {
if (BBRegisterPressure[PS] >=
TRI->getRegPressureSetLimit(*MBB.getParent(), PS)) {
return true;
}
}

return false;
}
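
A pressure set groups register classes that compete for the same physical registers, so the helper flags the block as soon as any set's recomputed maximum reaches its target-provided limit (note the inclusive >=). A minimal standalone sketch, with hypothetical set contents and limits; the real limits come from TargetRegisterInfo::getRegPressureSetLimit.

#include <cstdio>
#include <vector>

int main() {
  // Recomputed max pressure per set (say, GPR and FPR) vs. target limits.
  std::vector<unsigned> MaxSetPressure = {30, 34};
  std::vector<unsigned> SetLimit = {32, 32};

  bool ExceedsLimit = false;
  for (unsigned PS = 0; PS < MaxSetPressure.size(); ++PS) {
    if (MaxSetPressure[PS] >= SetLimit[PS]) { // at the limit counts as high
      ExceedsLimit = true;
      break;
    }
  }
  std::printf("high pressure: %s\n", ExceedsLimit ? "yes" : "no");
  return 0;
}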

/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
MachineBasicBlock *MBB,
@@ -1581,83 +1651,98 @@ bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
return HasAliasedStore;
}

/// Sink instructions into cycles if profitable. This especially tries to
/// prevent register spills caused by register pressure if there is little to no
/// overhead moving instructions into cycles.
bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
/// Aggressively sink instructions into cycles. This tries to sink all
/// instructions in the top-most preheaders in an attempt to reduce register
/// pressure. In particular, it will sink into multiple successor blocks
/// without limits on the amount of sinking, or on the type of ops being sunk
/// (so long as they are safe to sink).
bool MachineSinking::aggressivelySinkIntoCycle(
MachineCycle *Cycle, MachineInstr &I,
DenseMap<SinkItem, MachineInstr *> &SunkInstrs) {
// TODO: support instructions with multiple defs
if (I.getNumDefs() > 1)
return false;

LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Finding sink block for: " << I);
MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
assert(Preheader && "Cycle sink needs a preheader block");
MachineBasicBlock *SinkBlock = nullptr;
bool CanSink = true;
const MachineOperand &MO = I.getOperand(0);

for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI);
if (!Cycle->contains(MI.getParent())) {
LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n");
CanSink = false;
break;
}
SmallVector<std::pair<RegSubRegPair, MachineInstr *>> Uses;

// FIXME: Come up with a proper cost model that estimates whether sinking
// the instruction (and thus possibly executing it on every cycle
// iteration) is more expensive than a register.
// For now assumes that copies are cheap and thus almost always worth it.
if (!MI.isCopy()) {
LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n");
CanSink = false;
break;
MachineOperand &DefMO = I.getOperand(0);
for (MachineInstr &MI : MRI->use_instructions(DefMO.getReg())) {
Uses.push_back({{DefMO.getReg(), DefMO.getSubReg()}, &MI});
}

for (std::pair<RegSubRegPair, MachineInstr *> Entry : Uses) {
MachineInstr *MI = Entry.second;
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Analysing use: " << MI);
if (MI->isPHI()) {
LLVM_DEBUG(
dbgs() << "AggressiveCycleSink: Not attempting to sink for PHI.\n");
continue;
}
if (!SinkBlock) {
SinkBlock = MI.getParent();
LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: "
<< printMBBReference(*SinkBlock) << "\n");
// We cannot sink before the prologue
if (MI->isPosition() || TII->isBasicBlockPrologue(*MI)) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Use is BasicBlock prologue, "
"can't sink.\n");
continue;
}
SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
if (!SinkBlock) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n");
CanSink = false;
break;
if (!Cycle->contains(MI->getParent())) {
LLVM_DEBUG(
dbgs() << "AggressiveCycleSink: Use not in cycle, can't sink.\n");
continue;
}
LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: "
<< printMBBReference(*SinkBlock) << "\n");
}

if (!CanSink) {
LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
return false;
}
if (!SinkBlock) {
LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
return false;
}
if (SinkBlock == Preheader) {
LLVM_DEBUG(
dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
return false;
}
if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
LLVM_DEBUG(
dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
return false;
}
MachineBasicBlock *SinkBlock = MI->getParent();
MachineInstr *NewMI = nullptr;
SinkItem MapEntry(&I, SinkBlock);

auto SI = SunkInstrs.find(MapEntry);

// Check for the case in which we have already sunk a copy of this
// instruction into the user block.
if (SI != SunkInstrs.end()) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Already sunk to block: "
<< printMBBReference(*SinkBlock) << "\n");
NewMI = SI->second;
}

LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
I);
// Create a copy of the instruction in the use block.
if (!NewMI) {
LLVM_DEBUG(dbgs() << "AggressiveCycleSink: Sinking instruction to block: "
<< printMBBReference(*SinkBlock) << "\n");

NewMI = I.getMF()->CloneMachineInstr(&I);
if (DefMO.getReg().isVirtual()) {
const TargetRegisterClass *TRC = MRI->getRegClass(DefMO.getReg());
Register DestReg = MRI->createVirtualRegister(TRC);
NewMI->substituteRegister(DefMO.getReg(), DestReg, DefMO.getSubReg(),
*TRI);
}
SinkBlock->insert(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()),
NewMI);
SunkInstrs.insert({MapEntry, NewMI});
}

// Conservatively clear any kill flags on uses of sunk instruction
for (MachineOperand &MO : I.operands()) {
if (MO.isReg() && MO.readsReg())
// Conservatively clear any kill flags on uses of sunk instruction
for (MachineOperand &MO : NewMI->all_uses()) {
assert(MO.isReg() && MO.isUse());
RegsToClearKillFlags.insert(MO.getReg());
}
}

// The instruction is moved from its basic block, so do not retain the
// debug information.
assert(!I.isDebugInstr() && "Should not sink debug inst");
I.setDebugLoc(DebugLoc());
// The instruction is moved from its basic block, so do not retain the
// debug information.
assert(!NewMI->isDebugInstr() && "Should not sink debug inst");
NewMI->setDebugLoc(DebugLoc());

// Replace the use with the newly created virtual register.
RegSubRegPair &UseReg = Entry.first;
MI->substituteRegister(UseReg.Reg, NewMI->getOperand(0).getReg(),
UseReg.SubReg, *TRI);
}
// If we have replaced all uses, then delete the dead instruction
if (I.isDead(*MRI))
I.eraseFromParent();
return true;
}
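
Unlike the old SinkIntoCycle, which moved the instruction to a single nearest common dominator of all uses, the aggressive variant clones the instruction once per user block, gives each clone a fresh virtual register, and rewrites each use, with SunkInstrs memoizing the per-block clones. A minimal standalone sketch of that rewriting scheme, with STL stand-ins for the LLVM types; Instr, Use, and freshReg are hypothetical.

#include <cstdio>
#include <map>
#include <vector>

struct Instr { unsigned DefReg; };            // the preheader definition
struct Use { unsigned Block; unsigned Reg; }; // a use site inside the cycle

static unsigned NextVReg = 100;
static unsigned freshReg() { return NextVReg++; }

int main() {
  Instr Def{/*DefReg=*/7};                          // defined in the preheader
  std::vector<Use> Uses = {{1, 7}, {1, 7}, {2, 7}}; // used in blocks 1 and 2

  std::map<unsigned, Instr> SunkInstrs; // block -> clone (the SinkItem map)
  for (Use &U : Uses) {
    auto [It, Inserted] = SunkInstrs.try_emplace(U.Block, Instr{0});
    if (Inserted)
      It->second.DefReg = freshReg(); // first use in this block: clone once
    U.Reg = It->second.DefReg;        // substituteRegister on the use
  }
  // Every use now reads a block-local clone; the original def is dead.
  for (const Use &U : Uses)
    std::printf("block %u reads vreg %u\n", U.Block, U.Reg);
  return 0;
}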
