Commit d892521
[AMDGPU] Break-up large PHIs for DAGISel
DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes, and with large PHIs this can result in poor codegen: it introduces the need for a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/PHI "break-up" can easily be tuned or disabled through CL options in case it is not beneficial for some users. It is also only enabled for DAGISel, since GlobalISel handles PHIs much better (it works on the whole function).

The transform can both scalarize a PHI (break a vector into its elements) and simplify it (break a vector into smaller, more manageable subvectors).

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
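As an illustration of that slicing policy, here is a standalone sketch (not the pass itself): the planSlices helper and Slice struct are hypothetical names for this example only, and the hard-coded 32 mirrors the default of the new amdgpu-codegenprepare-break-large-phis-threshold option.

// Minimal sketch: decide how a fixed-vector PHI of NumElts elements of
// EltBits each would be sliced under the default 32-bit threshold.
// 8/16-bit elements are grouped into 32-bit subvectors with a scalarized
// tail; everything else is fully scalarized.
#include <cstdio>
#include <vector>

struct Slice {
  unsigned FirstElt; // index of the first element covered by this slice
  unsigned NumElts;  // 1 => scalar slice, >1 => subvector slice
};

static std::vector<Slice> planSlices(unsigned EltBits, unsigned NumElts,
                                     unsigned ThresholdBits = 32) {
  std::vector<Slice> Slices;
  if (EltBits * NumElts <= ThresholdBits)
    return Slices; // small PHI, left untouched (as in the pass)

  unsigned Idx = 0;
  if (EltBits == 8 || EltBits == 16) {
    const unsigned SubVecSize = 32 / EltBits;
    for (; Idx + SubVecSize <= NumElts; Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  for (; Idx < NumElts; ++Idx) // scalarize whatever is left
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // <9 x half>: four <2 x half> subvector slices plus one scalar tail slice.
  for (Slice S : planSlices(16, 9))
    std::printf("slice at %u, %u elt(s)\n", S.FirstElt, S.NumElts);
  // <5 x float>: fully scalarized into five single-element slices.
  std::printf("%zu slices\n", planSlices(32, 5).size());
}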
1 parent 7765e5d commit d892521

16 files changed: +2016 -1003 lines

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 111 additions & 0 deletions
@@ -47,6 +47,16 @@ static cl::opt<bool> Widen16BitOps(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool>
+    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
+                       cl::desc("Break large PHI nodes for DAGISel"),
+                       cl::ReallyHidden, cl::init(true));
+
+static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+    "amdgpu-codegenprepare-break-large-phis-threshold",
+    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
+    cl::ReallyHidden, cl::init(32));
+
 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
   cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
+  bool visitPHINode(PHINode &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
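The class change above only adds a visitor entry point. Assuming AMDGPUCodeGenPrepare derives from llvm::InstVisitor (as the visitFoo naming suggests), declaring visitPHINode is all that is needed for PHI instructions to be routed to the new code. A minimal sketch of that dispatch pattern, with a hypothetical ExampleVisitor:

// Sketch of the InstVisitor hook-up (hypothetical visitor, not the pass):
// visit(I) switches on the instruction's opcode and calls the matching
// visitPHINode/visitLoadInst/... overload, falling back to
// visitInstruction for anything without a dedicated handler.
#include "llvm/IR/InstVisitor.h"

namespace {
class ExampleVisitor : public llvm::InstVisitor<ExampleVisitor, bool> {
public:
  bool visitPHINode(llvm::PHINode &I) { return tryBreakUp(I); }
  bool visitInstruction(llvm::Instruction &) { return false; } // default

private:
  bool tryBreakUp(llvm::PHINode &) { return false; } // placeholder
};
} // namespace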
@@ -1383,6 +1394,106 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
+  // Break-up fixed-vector PHIs into smaller pieces.
+  // Default threshold is 32, so it breaks up any vector that's >32 bits into
+  // its elements, or into 32-bit pieces (for 8/16 bit elts).
+  //
+  // This is only helpful for DAGISel because it doesn't handle large PHIs as
+  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
+  // With large, odd-sized PHIs we may end up needing many `build_vector`
+  // operations with most elements being "undef". This inhibits a lot of
+  // optimization opportunities and can result in unreasonably high register
+  // pressure and the inevitable stack spilling.
+  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+    return false;
+
+  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
+  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+    return false;
+
+  struct VectorSlice {
+    Type *Ty = nullptr;
+    unsigned Idx = 0;
+    unsigned NumElts = 0;
+    std::vector<Value *> IncomingValues = {};
+    PHINode *NewPHI = nullptr;
+  };
+
+  std::vector<VectorSlice> Slices;
+
+  Type *EltTy = FVT->getElementType();
+  {
+    unsigned Idx = 0;
+    // For 8/16 bits type, don't scalarize fully but break it up into as many
+    // 32-bit slices as we can, and scalarize the tail.
+    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
+    const unsigned NumElts = FVT->getNumElements();
+    if (EltSize == 8 || EltSize == 16) {
+      const unsigned SubVecSize = (32 / EltSize);
+      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
+      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
+           Idx += SubVecSize)
+        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
+    }
+
+    // Scalarize all remaining elements.
+    for (; Idx < NumElts; ++Idx)
+      Slices.push_back(VectorSlice{EltTy, Idx, 1});
+  }
+
+  if (Slices.size() == 1)
+    return false;
+
+  // Break up this PHI's incoming values.
+  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
+    Value *Inc = I.getIncomingValue(Idx);
+
+    IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator());
+    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
+      B.SetCurrentDebugLocation(IncInst->getDebugLoc());
+
+    unsigned NameSuffix = 0;
+    for (VectorSlice &S : Slices) {
+      const auto ValName =
+          "largephi.extractslice" + std::to_string(NameSuffix++);
+      if (S.NumElts > 1) {
+        SmallVector<int, 4> Mask;
+        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
+          Mask.push_back(K);
+        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
+      } else
+        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
+    }
+  }
+
+  // Now create one PHI per vector piece.
+  IRBuilder<> B(I.getParent()->getFirstNonPHI());
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  for (VectorSlice &S : Slices) {
+    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
+    for (const auto &[Idx, BB] : enumerate(I.blocks()))
+      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
+  }
+
+  // And replace this PHI with a vector of all the previous PHI values.
+  Value *Vec = PoisonValue::get(FVT);
+  unsigned NameSuffix = 0;
+  for (VectorSlice &S : Slices) {
+    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
+    if (S.NumElts > 1)
+      Vec =
+          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
+    else
+      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
+  }
+
+  I.replaceAllUsesWith(Vec);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
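As a worked example of the extraction and reassembly loops in the hunk above (illustrative arithmetic only; just the largephi.* value names come from the patch): for a hypothetical <8 x i8> PHI, each predecessor block gets two largephi.extractslice shufflevectors with contiguous masks, and each per-slice PHI is inserted back at its slice's first element index when the full vector is rebuilt.

// Sketch for a <8 x i8> PHI with the default 32-bit threshold:
// SubVecSize = 32 / 8 = 4, so there are two <4 x i8> slices.
#include <cstdio>

int main() {
  const unsigned EltBits = 8, NumElts = 8;
  const unsigned SubVecSize = 32 / EltBits; // 4
  unsigned Suffix = 0;
  for (unsigned Idx = 0; Idx < NumElts; Idx += SubVecSize) {
    // Mask fed to CreateShuffleVector in each predecessor block:
    std::printf("largephi.extractslice%u: mask = {", Suffix);
    for (unsigned K = Idx; K < Idx + SubVecSize; ++K)
      std::printf("%s%u", K == Idx ? "" : ", ", K);
    // The matching slice PHI is re-inserted at the same starting element
    // index by CreateInsertVector in the PHI's block:
    std::printf("}, reinserted at element %u\n", Idx);
    ++Suffix;
  }
  return 0;
}
// Prints:
//   largephi.extractslice0: mask = {0, 1, 2, 3}, reinserted at element 0
//   largephi.extractslice1: mask = {4, 5, 6, 7}, reinserted at element 4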
