@@ -47,6 +47,16 @@ static cl::opt<bool> Widen16BitOps(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool>
+    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
+                       cl::desc("Break large PHI nodes for DAGISel"),
+                       cl::ReallyHidden, cl::init(true));
+
+static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+    "amdgpu-codegenprepare-break-large-phis-threshold",
+    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
+    cl::ReallyHidden, cl::init(32));
+
 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
   cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
+  bool visitPHINode(PHINode &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
@@ -1383,6 +1394,106 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
+  // Break-up fixed-vector PHIs into smaller pieces.
+  // Default threshold is 32, so it breaks up any vector that's >32 bits into
+  // its elements, or into 32-bit pieces (for 8/16 bit elts).
+  //
+  // This is only helpful for DAGISel because it doesn't handle large PHIs as
+  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
+  // With large, odd-sized PHIs we may end up needing many `build_vector`
+  // operations with most elements being "undef". This inhibits a lot of
+  // optimization opportunities and can result in unreasonably high register
+  // pressure and the inevitable stack spilling.
+  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+    return false;
+
+  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
+  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+    return false;
+
+  struct VectorSlice {
+    Type *Ty = nullptr;
+    unsigned Idx = 0;
+    unsigned NumElts = 0;
+    std::vector<Value *> IncomingValues = {};
+    PHINode *NewPHI = nullptr;
+  };
+
+  std::vector<VectorSlice> Slices;
+
+  Type *EltTy = FVT->getElementType();
+  {
+    unsigned Idx = 0;
+    // For 8/16 bits type, don't scalarize fully but break it up into as many
+    // 32-bit slices as we can, and scalarize the tail.
+    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
+    const unsigned NumElts = FVT->getNumElements();
+    if (EltSize == 8 || EltSize == 16) {
+      const unsigned SubVecSize = (32 / EltSize);
+      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
+      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
+           Idx += SubVecSize)
+        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
+    }
+
+    // Scalarize all remaining elements.
+    for (; Idx < NumElts; ++Idx)
+      Slices.push_back(VectorSlice{EltTy, Idx, 1});
+  }
+
+  if (Slices.size() == 1)
+    return false;
+
+  // Break up this PHI's incoming values.
+  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
+    Value *Inc = I.getIncomingValue(Idx);
+
+    IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator());
+    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
+      B.SetCurrentDebugLocation(IncInst->getDebugLoc());
+
+    unsigned NameSuffix = 0;
+    for (VectorSlice &S : Slices) {
+      const auto ValName =
+          "largephi.extractslice" + std::to_string(NameSuffix++);
+      if (S.NumElts > 1) {
+        SmallVector<int, 4> Mask;
+        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
+          Mask.push_back(K);
+        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
+      } else
+        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
+    }
+  }
+
+  // Now create one PHI per vector piece.
+  IRBuilder<> B(I.getParent()->getFirstNonPHI());
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  for (VectorSlice &S : Slices) {
+    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
+    for (const auto &[Idx, BB] : enumerate(I.blocks()))
+      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
+  }
+
+  // And replace this PHI with a vector of all the previous PHI values.
+  Value *Vec = PoisonValue::get(FVT);
+  unsigned NameSuffix = 0;
+  for (VectorSlice &S : Slices) {
+    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
+    if (S.NumElts > 1)
+      Vec =
+          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
+    else
+      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
+  }
+
+  I.replaceAllUsesWith(Vec);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
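
For readers who want to see the slicing scheme in isolation, here is a minimal, standalone C++ sketch (not part of the patch; the helper name sliceVector is made up) that mirrors the arithmetic visitPHINode uses to pick slices: 8/16-bit element vectors are first grouped into 32-bit sub-vectors, and any leftover elements are scalarized.

// Standalone illustration only; this is not code from the patch.
#include <cstdio>
#include <vector>

struct Slice {
  unsigned Idx;     // index of the first element covered by this slice
  unsigned NumElts; // number of elements in the slice (1 == scalarized)
};

// Mirrors the slicing loop in visitPHINode: group 8/16-bit elements into
// 32-bit sub-vectors, then scalarize whatever is left over.
static std::vector<Slice> sliceVector(unsigned EltSizeBits, unsigned NumElts) {
  std::vector<Slice> Slices;
  unsigned Idx = 0;
  if (EltSizeBits == 8 || EltSizeBits == 16) {
    const unsigned SubVecSize = 32 / EltSizeBits;
    const unsigned End = (NumElts / SubVecSize) * SubVecSize; // alignDown
    for (; Idx < End; Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  for (; Idx < NumElts; ++Idx) // scalarize the tail
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // A <5 x i16> PHI (80 bits, above the default 32-bit threshold) becomes
  // two <2 x i16> slices plus one scalar i16 tail.
  for (const Slice &S : sliceVector(16, 5))
    std::printf("slice at elt %u, %u element(s)\n", S.Idx, S.NumElts);
  return 0;
}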