Skip to content

Commit 94334e4

Browse files
committed
[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases
It is known that a vector whose element type fits in i16 will be split and scalarized in SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). LRO attempts to undo the scalarizing of vectors across basic block boundaries and shoehorn the values into VGPRs. LRO is beneficial for operations that natively work on illegal vector types, preventing flip-flopping between SGPRs and VGPRs. If we know that operations on a vector will be split and scalarized anyway, then we don't want to shoehorn them back into a VGPR. Operations that are known to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, and PHI nodes, to name a few.
1 parent c24e5f9 commit 94334e4

File tree

7 files changed

+4205
-591
lines changed

7 files changed

+4205
-591
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "AMDGPU.h"
1616
#include "AMDGPUTargetMachine.h"
17+
#include "AMDGPUTargetTransformInfo.h"
1718
#include "llvm/Analysis/AssumptionCache.h"
1819
#include "llvm/Analysis/UniformityAnalysis.h"
1920
#include "llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
4546
Function &F;
4647
const DataLayout &DL;
4748
const GCNSubtarget &ST;
49+
const TargetTransformInfo &TTI;
4850

4951
AssumptionCache *const AC;
5052
UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
5355

5456
public:
5557
AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
58+
const TargetTransformInfo &TTI,
5659
AssumptionCache *AC, UniformityInfo &UA)
57-
: F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
60+
: F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
5861
bool run();
5962
bool visitInstruction(Instruction &) { return false; }
6063

@@ -75,6 +78,8 @@ class LiveRegOptimizer {
7578
Module &Mod;
7679
const DataLayout &DL;
7780
const GCNSubtarget &ST;
81+
const TargetTransformInfo &TTI;
82+
7883
/// The scalar type to convert to
7984
Type *const ConvertToScalar;
8085
/// The set of visited Instructions
@@ -125,8 +130,43 @@ class LiveRegOptimizer {
125130
return LK.first != TargetLoweringBase::TypeLegal;
126131
}
127132

128-
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
129-
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
133+
// Filtering based on operation or its cost.
134+
// If an operation incurs high enough cost or natively work on
135+
// vector of illegal type, ie. v2i8, then it makes sense to try
136+
// to avoid scalarizing across BB.
137+
bool shouldReplaceBasedOnOp(Instruction *II) {
138+
// Ignore pseudos
139+
if (II->isDebugOrPseudoInst())
140+
return false;
141+
142+
// Instruction Cost
143+
const auto Cost = TTI.getInstructionCost(II,
144+
TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
145+
LLVM_DEBUG(
146+
dbgs() << "shouldReplaceBasedOnOp: " <<
147+
*II << " Cost=" << Cost << '\n';
148+
);
149+
if (Cost >= 8)
150+
return true;
151+
152+
// Intrinsics - assume they natively handle illegal type
153+
if (dyn_cast<IntrinsicInst>(II))
154+
return true;
155+
156+
// Stores
157+
if (dyn_cast<StoreInst>(II))
158+
return true;
159+
160+
// Shuffles
161+
if (dyn_cast<ShuffleVectorInst>(II))
162+
return true;
163+
164+
return false;
165+
}
166+
167+
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
168+
const TargetTransformInfo &TTI)
169+
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
130170
ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
131171
};
132172

@@ -140,7 +180,7 @@ bool AMDGPULateCodeGenPrepare::run() {
140180
// vectors to equivalent vectors of legal type (which are converted back
141181
// before uses in subsequent blocks), to pack the bits into fewer physical
142182
// registers (used in CopyToReg/CopyFromReg pairs).
143-
LiveRegOptimizer LRO(*F.getParent(), ST);
183+
LiveRegOptimizer LRO(*F.getParent(), ST, TTI);
144184

145185
bool Changed = false;
146186

@@ -259,6 +299,9 @@ bool LiveRegOptimizer::optimizeLiveType(
259299
if (!shouldReplace(II->getType()))
260300
continue;
261301

302+
if (!shouldReplaceBasedOnOp(II))
303+
continue;
304+
262305
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
263306
PhiNodes.insert(Phi);
264307
// Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +521,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
478521
PreservedAnalyses
479522
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
480523
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
524+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
481525

482526
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
483527
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
484528

485-
bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
529+
bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
486530

487531
if (!Changed)
488532
return PreservedAnalyses::all();
@@ -518,13 +562,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
518562
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
519563
const TargetMachine &TM = TPC.getTM<TargetMachine>();
520564
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
565+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
521566

522567
AssumptionCache &AC =
523568
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
524569
UniformityInfo &UI =
525570
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
526571

527-
return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
572+
return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
528573
}
529574

530575
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,

0 commit comments

Comments
 (0)