@@ -52,6 +52,12 @@ static cl::opt<bool>
                        cl::desc("Break large PHI nodes for DAGISel"),
                        cl::ReallyHidden, cl::init(true));
 
+static cl::opt<bool>
+    ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
+                            cl::desc("For testing purposes, always break large "
+                                     "PHIs even if it isn't profitable."),
+                            cl::ReallyHidden, cl::init(false));
+
 static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
     "amdgpu-codegenprepare-break-large-phis-threshold",
     cl::desc("Minimum type size in bits for breaking large PHI nodes"),
@@ -1394,6 +1400,50 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
   return Changed;
 }
 
+// Helper for breaking large PHIs that returns true when an extractelement on V
+// is likely to be folded away by the DAG combiner.
+static bool isInterestingPHIIncomingValue(Value *V, FixedVectorType *FVT) {
+  InsertElementInst *IE = dyn_cast<InsertElementInst>(V);
+
+  // Constants and insertelement chains are interesting.
+  if (!IE)
+    return isa<Constant>(V);
+
+  // Check if this is a simple chain of insertelements that fills the vector.
+  // If that's the case, we can break up this PHI node profitably because the
+  // extractelement we will insert will get folded out.
+  BasicBlock *BB = IE->getParent();
+  BitVector EltsCovered(FVT->getNumElements());
+  InsertElementInst *Next = IE;
+  while (Next && !EltsCovered.all()) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(Next->getOperand(2));
+
+    // Non-constant or out-of-bounds index -> folding is unlikely.
+    // Note that this is more of a sanity check - canonical IR should
+    // already have replaced those with poison.
+    if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
+      return false;
+
+    EltsCovered.set(Idx->getSExtValue());
+
+    // If the insertelement chain ends with a constant, it's fine.
+    if (isa<Constant>(Next->getOperand(0)))
+      return true;
+
+    Next = dyn_cast<InsertElementInst>(Next->getOperand(0));
+
+    // If the chain is spread across basic blocks, the DAG combiner
+    // won't see it in its entirety and is unlikely to be able to fold
+    // everything away.
+    if (Next && Next->getParent() != BB)
+      return false;
+  }
+
+  // All elements covered: all of the extractelements will likely be
+  // combined.
+  return EltsCovered.all();
+}
+
 bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
   // Break-up fixed-vector PHIs into smaller pieces.
   // Default threshold is 32, so it breaks up any vector that's >32 bits into
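The heart of the helper is the chain walk: follow each insertelement's vector operand, mark the lane it writes, and report success once every lane is covered or the chain bottoms out in a constant. Below is a standalone sketch of that coverage-tracking loop, using std::bitset in place of llvm::BitVector and a hypothetical ChainNode stand-in for InsertElementInst; the cross-basic-block bailout is omitted for brevity:

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <optional>

// ChainNode is an illustrative stand-in for InsertElementInst: it writes lane
// Idx into the vector produced by Prev; PrevIsConstant models
// isa<Constant>(getOperand(0)).
struct ChainNode {
  std::optional<uint64_t> Idx; // nullopt models a non-constant index
  const ChainNode *Prev = nullptr;
  bool PrevIsConstant = false;
};

// Returns true when the chain writes all NumElts lanes or ends in a constant,
// i.e. when a per-lane extractelement could fold to the inserted scalar.
template <std::size_t NumElts> bool chainCoversVector(const ChainNode *N) {
  std::bitset<NumElts> EltsCovered;
  while (N && !EltsCovered.all()) {
    // Non-constant or out-of-bounds lane index: folding is unlikely.
    if (!N->Idx || *N->Idx >= NumElts)
      return false;
    EltsCovered.set(*N->Idx);
    if (N->PrevIsConstant)
      return true; // remaining lanes come from a known constant vector
    N = N->Prev;
  }
  return EltsCovered.all();
}

For example, a two-node chain writing lanes 0 and 1 of a two-element vector makes chainCoversVector<2> return true, which is exactly the case where the per-lane extracts the pass inserts would fold away.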
@@ -1412,6 +1462,24 @@ bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
   if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
     return false;
 
+  // Try to avoid unprofitable cases:
+  // - Don't break PHIs that have no interesting incoming values. That is, where
+  //   there is no clear opportunity to fold the "extractelement" instructions we
+  //   would add.
+  //   - Note: IC does not run after this pass, so we're only interested in the
+  //     folding that the DAG combiner can do.
+  // - For simplicity, don't break PHIs that are used by other PHIs because it'd
+  //   require us to determine if the whole "chain" can be converted or not. E.g.,
+  //   if we broke this PHI but not its user, we would actually make things worse.
+  if (!ForceScalarizeLargePHIs) {
+    if (none_of(
+            I.incoming_values(),
+            [&](Value *V) { return isInterestingPHIIncomingValue(V, FVT); }) ||
+        any_of(I.users(), [&](User *U) { return isa<PHINode>(U); })) {
+      return false;
+    }
+  }
+
   struct VectorSlice {
     Type *Ty = nullptr;
     unsigned Idx = 0;
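The gate itself is just two early-exit scans over the PHI: bail out unless at least one incoming value is interesting, and never touch a PHI that feeds another PHI. A minimal sketch of that shape using llvm::none_of / llvm::any_of from STLExtras.h; the Phi struct and its fields are illustrative stand-ins, with the per-element predicates assumed precomputed:

#include "llvm/ADT/STLExtras.h"
#include <vector>

// Illustrative stand-in for the PHI under inspection: the bools represent
// precomputed results of isInterestingPHIIncomingValue and isa<PHINode>(U).
struct Phi {
  std::vector<bool> IncomingInteresting;
  std::vector<bool> UserIsPhi;
};

bool worthBreaking(const Phi &P, bool Force) {
  if (Force) // ForceScalarizeLargePHIs analogue: skip the heuristic entirely
    return true;
  // No foldable incoming value, or a PHI user (breaking only part of a PHI
  // chain could make things worse): leave the PHI alone.
  if (llvm::none_of(P.IncomingInteresting, [](bool B) { return B; }) ||
      llvm::any_of(P.UserIsPhi, [](bool B) { return B; }))
    return false;
  return true;
}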