Skip to content

Commit b3b3cb2

Browse files
committed
[AMDGPU] Less aggressively break large PHIs
In some cases, breaking large PHIs can very negatively affect performance (3x more instructions observed in a particular test case). This patch adds some basic profitability heuristics to help with some of these issues without affecting the "good" cases. e.g. avoid breaking PHIs if it causes back-and-forth between vector/scalar form for no good reason. Fixes SWDEV-392803 Fixes SWDEV-393781 Fixes SWDEV-394228 Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D147786
1 parent 5c60a08 commit b3b3cb2

10 files changed

+925
-454
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ static cl::opt<bool>
5252
cl::desc("Break large PHI nodes for DAGISel"),
5353
cl::ReallyHidden, cl::init(true));
5454

55+
static cl::opt<bool>
56+
ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
57+
cl::desc("For testing purposes, always break large "
58+
"PHIs even if it isn't profitable."),
59+
cl::ReallyHidden, cl::init(false));
60+
5561
static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
5662
"amdgpu-codegenprepare-break-large-phis-threshold",
5763
cl::desc("Minimum type size in bits for breaking large PHI nodes"),
@@ -1394,6 +1400,50 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
13941400
return Changed;
13951401
}
13961402

1403+
// Helper for breaking large PHIs that returns true when an extractelement on V
1404+
// is likely to be folded away by the DAG combiner.
1405+
static bool isInterestingPHIIncomingValue(Value *V, FixedVectorType *FVT) {
1406+
InsertElementInst *IE = dyn_cast<InsertElementInst>(V);
1407+
1408+
// Constants & InsertElements chains are interesting.
1409+
if (!IE)
1410+
return isa<Constant>(V);
1411+
1412+
// Check if this is a simple chain of insertelement that fills the vector. If
1413+
// that's the case, we can break up this PHI node profitably because the
1414+
// extractelement we will insert will get folded out.
1415+
BasicBlock *BB = IE->getParent();
1416+
BitVector EltsCovered(FVT->getNumElements());
1417+
InsertElementInst *Next = IE;
1418+
while (Next && !EltsCovered.all()) {
1419+
ConstantInt *Idx = dyn_cast<ConstantInt>(Next->getOperand(2));
1420+
1421+
// Non constant index/out of bounds index -> folding is unlikely.
1422+
// Note that this is more of a sanity check - canonical IR should
1423+
// already have replaced those with poison.
1424+
if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
1425+
return false;
1426+
1427+
EltsCovered.set(Idx->getSExtValue());
1428+
1429+
// If the insertelement chain ends with a constant, it's fine.
1430+
if (isa<Constant>(Next->getOperand(0)))
1431+
return true;
1432+
1433+
Next = dyn_cast<InsertElementInst>(Next->getOperand(0));
1434+
1435+
// If the chain is spread across basic blocks, the DAG combiner
1436+
// won't see it in its entirety and is unlikely to be able to fold
1437+
// evevrything away.
1438+
if (Next && Next->getParent() != BB)
1439+
return false;
1440+
}
1441+
1442+
// All elements covered, all of the extract elements will likely be
1443+
// combined.
1444+
return EltsCovered.all();
1445+
}
1446+
13971447
bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
13981448
// Break-up fixed-vector PHIs into smaller pieces.
13991449
// Default threshold is 32, so it breaks up any vector that's >32 bits into
@@ -1412,6 +1462,24 @@ bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
14121462
if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
14131463
return false;
14141464

1465+
// Try to avoid unprofitable cases:
1466+
// - Don't break PHIs that have no interesting incoming values. That is, where
1467+
// there is no clear opportunity to fold the "extractelement" instructions we
1468+
// would add.
1469+
// - Note: IC does not run after this pass, so we're only interested in the
1470+
// folding that the DAG combiner can do.
1471+
// - For simplicity, don't break PHIs that are used by other PHIs because it'd
1472+
// require us to determine if the whole "chain" can be converted or not. e.g.
1473+
// if we broke this PHI but not its user, we would actually make things worse.
1474+
if (!ForceScalarizeLargePHIs) {
1475+
if (none_of(
1476+
I.incoming_values(),
1477+
[&](Value *V) { return isInterestingPHIIncomingValue(V, FVT); }) ||
1478+
any_of(I.users(), [&](User *U) { return isa<PHINode>(U); })) {
1479+
return false;
1480+
}
1481+
}
1482+
14151483
struct VectorSlice {
14161484
Type *Ty = nullptr;
14171485
unsigned Idx = 0;

0 commit comments

Comments
 (0)