Skip to content

Commit 21d78b9

Browse files
committed
[AMDGPU] Filter candidates of LiveRegOptimizer for profitable cases
It is known that vectors whose elements fit in i16 will be split and scalarized in SelectionDAG's type legalizer (see SIISelLowering::getPreferredVectorAction). LRO attempts to undo the scalarizing of vectors across basic block boundaries and shoehorn Values into VGPRs. LRO is beneficial for operations that natively work on illegal vector types, as it prevents flip-flopping between SGPR and VGPR. If we know that operations on a vector will be split and scalarized, then we don't want to shoehorn them back into VGPRs. Operations that we know to work natively on illegal vector types usually come in the form of intrinsics (MFMA, DOT8), buffer stores, shuffles, and phi nodes, to name a few.
1 parent c24e5f9 commit 21d78b9

File tree

7 files changed

+4204
-592
lines changed

7 files changed

+4204
-592
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "AMDGPU.h"
1616
#include "AMDGPUTargetMachine.h"
17+
#include "AMDGPUTargetTransformInfo.h"
1718
#include "llvm/Analysis/AssumptionCache.h"
1819
#include "llvm/Analysis/UniformityAnalysis.h"
1920
#include "llvm/Analysis/ValueTracking.h"
@@ -45,6 +46,7 @@ class AMDGPULateCodeGenPrepare
4546
Function &F;
4647
const DataLayout &DL;
4748
const GCNSubtarget &ST;
49+
const TargetTransformInfo &TTI;
4850

4951
AssumptionCache *const AC;
5052
UniformityInfo &UA;
@@ -53,8 +55,9 @@ class AMDGPULateCodeGenPrepare
5355

5456
public:
5557
AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
56-
AssumptionCache *AC, UniformityInfo &UA)
57-
: F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
58+
const TargetTransformInfo &TTI, AssumptionCache *AC,
59+
UniformityInfo &UA)
60+
: F(F), DL(F.getDataLayout()), ST(ST), TTI(TTI), AC(AC), UA(UA) {}
5861
bool run();
5962
bool visitInstruction(Instruction &) { return false; }
6063

@@ -75,6 +78,8 @@ class LiveRegOptimizer {
7578
Module &Mod;
7679
const DataLayout &DL;
7780
const GCNSubtarget &ST;
81+
const TargetTransformInfo &TTI;
82+
7883
/// The scalar type to convert to
7984
Type *const ConvertToScalar;
8085
/// The set of visited Instructions
@@ -125,8 +130,41 @@ class LiveRegOptimizer {
125130
return LK.first != TargetLoweringBase::TypeLegal;
126131
}
127132

128-
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
129-
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
133+
// Filtering based on operation or its cost.
134+
// If an operation incurs high enough cost or natively work on
135+
// vector of illegal type, ie. v2i8, then it makes sense to try
136+
// to avoid scalarizing across BB.
137+
bool shouldReplaceBasedOnOp(Instruction *II) {
138+
// Ignore pseudos
139+
if (II->isDebugOrPseudoInst())
140+
return false;
141+
142+
// Instruction Cost
143+
const auto Cost = TTI.getInstructionCost(
144+
II, TargetTransformInfo::TargetCostKind::TCK_SizeAndLatency);
145+
LLVM_DEBUG(dbgs() << "shouldReplaceBasedOnOp: " << *II << " Cost=" << Cost
146+
<< '\n';);
147+
if (Cost >= 8)
148+
return true;
149+
150+
// Intrinsics - assume they natively handle illegal type
151+
if (dyn_cast<IntrinsicInst>(II))
152+
return true;
153+
154+
// Stores
155+
if (dyn_cast<StoreInst>(II))
156+
return true;
157+
158+
// Shuffles
159+
if (dyn_cast<ShuffleVectorInst>(II))
160+
return true;
161+
162+
return false;
163+
}
164+
165+
LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST,
166+
const TargetTransformInfo &TTI)
167+
: Mod(Mod), DL(Mod.getDataLayout()), ST(ST), TTI(TTI),
130168
ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
131169
};
132170

@@ -140,7 +178,7 @@ bool AMDGPULateCodeGenPrepare::run() {
140178
// vectors to equivalent vectors of legal type (which are converted back
141179
// before uses in subsequent blocks), to pack the bits into fewer physical
142180
// registers (used in CopyToReg/CopyFromReg pairs).
143-
LiveRegOptimizer LRO(*F.getParent(), ST);
181+
LiveRegOptimizer LRO(*F.getParent(), ST, TTI);
144182

145183
bool Changed = false;
146184

@@ -259,6 +297,9 @@ bool LiveRegOptimizer::optimizeLiveType(
259297
if (!shouldReplace(II->getType()))
260298
continue;
261299

300+
if (!shouldReplaceBasedOnOp(II))
301+
continue;
302+
262303
if (PHINode *Phi = dyn_cast<PHINode>(II)) {
263304
PhiNodes.insert(Phi);
264305
// Collect all the incoming values of problematic PHI nodes.
@@ -478,11 +519,12 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
478519
PreservedAnalyses
479520
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
480521
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
522+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
481523

482524
AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
483525
UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);
484526

485-
bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
527+
bool Changed = AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
486528

487529
if (!Changed)
488530
return PreservedAnalyses::all();
@@ -518,13 +560,14 @@ bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
518560
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
519561
const TargetMachine &TM = TPC.getTM<TargetMachine>();
520562
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
563+
const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
521564

522565
AssumptionCache &AC =
523566
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
524567
UniformityInfo &UI =
525568
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
526569

527-
return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
570+
return AMDGPULateCodeGenPrepare(F, ST, TTI, &AC, UI).run();
528571
}
529572

530573
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,

0 commit comments

Comments
 (0)