Commit d892521
[AMDGPU] Break-up large PHIs for DAGISel
DAGISel uses CopyToReg/CopyFromReg to lower PHI nodes, and with large PHIs this can result in poor codegen: it introduces the need for a build_vector before copying the PHI value, and that build_vector may have many undef elements. This can cause very high register pressure and abnormal stack usage in some cases.

This scalarization/PHI "break-up" can easily be tuned or disabled through CL options in case it is not beneficial for some users. It is also only enabled for DAGISel, since GlobalISel handles PHIs much better (it works on the whole function).

The transform can both scalarize a PHI (break a vector into its elements) and simplify it (break a vector into smaller, more manageable subvectors).

Fixes SWDEV-321581

Reviewed By: kzhuravl

Differential Revision: https://reviews.llvm.org/D143731
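As an illustration of that slicing policy, here is a standalone sketch (not the pass itself): the planSlices helper and Slice struct are hypothetical names for this example only, and the hard-coded 32 mirrors the default of the new amdgpu-codegenprepare-break-large-phis-threshold option.

// Minimal sketch: decide how a fixed-vector PHI of NumElts elements of
// EltBits each would be sliced under the default 32-bit threshold.
// 8/16-bit elements are grouped into 32-bit subvectors with a scalarized
// tail; everything else is fully scalarized.
#include <cstdio>
#include <vector>

struct Slice {
  unsigned FirstElt; // index of the first element covered by this slice
  unsigned NumElts;  // 1 => scalar slice, >1 => subvector slice
};

static std::vector<Slice> planSlices(unsigned EltBits, unsigned NumElts,
                                     unsigned ThresholdBits = 32) {
  std::vector<Slice> Slices;
  if (EltBits * NumElts <= ThresholdBits)
    return Slices; // small PHI, left untouched (as in the pass)

  unsigned Idx = 0;
  if (EltBits == 8 || EltBits == 16) {
    const unsigned SubVecSize = 32 / EltBits;
    for (; Idx + SubVecSize <= NumElts; Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  for (; Idx < NumElts; ++Idx) // scalarize whatever is left
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // <9 x half>: four <2 x half> subvector slices plus one scalar tail slice.
  for (Slice S : planSlices(16, 9))
    std::printf("slice at %u, %u elt(s)\n", S.FirstElt, S.NumElts);
  // <5 x float>: fully scalarized into five single-element slices.
  std::printf("%zu slices\n", planSlices(32, 5).size());
}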
1 parent 7765e5d commit d892521

16 files changed: +2016 -1003 lines

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 111 additions & 0 deletions
@@ -47,6 +47,16 @@ static cl::opt<bool> Widen16BitOps(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool>
+    ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
+                       cl::desc("Break large PHI nodes for DAGISel"),
+                       cl::ReallyHidden, cl::init(true));
+
+static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+    "amdgpu-codegenprepare-break-large-phis-threshold",
+    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
+    cl::ReallyHidden, cl::init(32));
+
 static cl::opt<bool> UseMul24Intrin(
   "amdgpu-codegenprepare-mul24",
   cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -213,6 +223,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
+  bool visitPHINode(PHINode &I);
 
   bool visitIntrinsicInst(IntrinsicInst &I);
   bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
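The class change above only adds a visitor entry point. Assuming AMDGPUCodeGenPrepare derives from llvm::InstVisitor (as the visitFoo naming suggests), declaring visitPHINode is all that is needed for PHI instructions to be routed to the new code. A minimal sketch of that dispatch pattern, with a hypothetical ExampleVisitor:

// Sketch of the InstVisitor hook-up (hypothetical visitor, not the pass):
// visit(I) switches on the instruction's opcode and calls the matching
// visitPHINode/visitLoadInst/... overload, falling back to
// visitInstruction for anything without a dedicated handler.
#include "llvm/IR/InstVisitor.h"

namespace {
class ExampleVisitor : public llvm::InstVisitor<ExampleVisitor, bool> {
public:
  bool visitPHINode(llvm::PHINode &I) { return tryBreakUp(I); }
  bool visitInstruction(llvm::Instruction &) { return false; } // default

private:
  bool tryBreakUp(llvm::PHINode &) { return false; } // placeholder
};
} // namespace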
@@ -1383,6 +1394,106 @@ bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitPHINode(PHINode &I) {
+  // Break-up fixed-vector PHIs into smaller pieces.
+  // Default threshold is 32, so it breaks up any vector that's >32 bits into
+  // its elements, or into 32-bit pieces (for 8/16 bit elts).
+  //
+  // This is only helpful for DAGISel because it doesn't handle large PHIs as
+  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
+  // With large, odd-sized PHIs we may end up needing many `build_vector`
+  // operations with most elements being "undef". This inhibits a lot of
+  // optimization opportunities and can result in unreasonably high register
+  // pressure and the inevitable stack spilling.
+  if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+    return false;
+
+  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
+  if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+    return false;
+
+  struct VectorSlice {
+    Type *Ty = nullptr;
+    unsigned Idx = 0;
+    unsigned NumElts = 0;
+    std::vector<Value *> IncomingValues = {};
+    PHINode *NewPHI = nullptr;
+  };
+
+  std::vector<VectorSlice> Slices;
+
+  Type *EltTy = FVT->getElementType();
+  {
+    unsigned Idx = 0;
+    // For 8/16 bits type, don't scalarize fully but break it up into as many
+    // 32-bit slices as we can, and scalarize the tail.
+    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
+    const unsigned NumElts = FVT->getNumElements();
+    if (EltSize == 8 || EltSize == 16) {
+      const unsigned SubVecSize = (32 / EltSize);
+      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
+      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
+           Idx += SubVecSize)
+        Slices.push_back(VectorSlice{SubVecTy, Idx, SubVecSize});
+    }
+
+    // Scalarize all remaining elements.
+    for (; Idx < NumElts; ++Idx)
+      Slices.push_back(VectorSlice{EltTy, Idx, 1});
+  }
+
+  if (Slices.size() == 1)
+    return false;
+
+  // Break up this PHI's incoming values.
+  for (unsigned Idx = 0; Idx < I.getNumIncomingValues(); ++Idx) {
+    Value *Inc = I.getIncomingValue(Idx);
+
+    IRBuilder<> B(I.getIncomingBlock(Idx)->getTerminator());
+    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
+      B.SetCurrentDebugLocation(IncInst->getDebugLoc());
+
+    unsigned NameSuffix = 0;
+    for (VectorSlice &S : Slices) {
+      const auto ValName =
+          "largephi.extractslice" + std::to_string(NameSuffix++);
+      if (S.NumElts > 1) {
+        SmallVector<int, 4> Mask;
+        for (unsigned K = S.Idx; K < (S.Idx + S.NumElts); ++K)
+          Mask.push_back(K);
+        S.IncomingValues.push_back(B.CreateShuffleVector(Inc, Mask, ValName));
+      } else
+        S.IncomingValues.push_back(B.CreateExtractElement(Inc, S.Idx, ValName));
+    }
+  }
+
+  // Now create one PHI per vector piece.
+  IRBuilder<> B(I.getParent()->getFirstNonPHI());
+  B.SetCurrentDebugLocation(I.getDebugLoc());
+
+  for (VectorSlice &S : Slices) {
+    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
+    for (const auto &[Idx, BB] : enumerate(I.blocks()))
+      S.NewPHI->addIncoming(S.IncomingValues[Idx], BB);
+  }
+
+  // And replace this PHI with a vector of all the previous PHI values.
+  Value *Vec = PoisonValue::get(FVT);
+  unsigned NameSuffix = 0;
+  for (VectorSlice &S : Slices) {
+    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
+    if (S.NumElts > 1)
+      Vec =
+          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
+    else
+      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
+  }
+
+  I.replaceAllUsesWith(Vec);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   switch (I.getIntrinsicID()) {
   case Intrinsic::bitreverse:
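As a worked example of the extraction and reassembly loops in the hunk above (illustrative arithmetic only; just the largephi.* value names come from the patch): for a hypothetical <8 x i8> PHI, each predecessor block gets two largephi.extractslice shufflevectors with contiguous masks, and each per-slice PHI is inserted back at its slice's first element index when the full vector is rebuilt.

// Sketch for a <8 x i8> PHI with the default 32-bit threshold:
// SubVecSize = 32 / 8 = 4, so there are two <4 x i8> slices.
#include <cstdio>

int main() {
  const unsigned EltBits = 8, NumElts = 8;
  const unsigned SubVecSize = 32 / EltBits; // 4
  unsigned Suffix = 0;
  for (unsigned Idx = 0; Idx < NumElts; Idx += SubVecSize) {
    // Mask fed to CreateShuffleVector in each predecessor block:
    std::printf("largephi.extractslice%u: mask = {", Suffix);
    for (unsigned K = Idx; K < Idx + SubVecSize; ++K)
      std::printf("%s%u", K == Idx ? "" : ", ", K);
    // The matching slice PHI is re-inserted at the same starting element
    // index by CreateInsertVector in the PHI's block:
    std::printf("}, reinserted at element %u\n", Idx);
    ++Suffix;
  }
  return 0;
}
// Prints:
//   largephi.extractslice0: mask = {0, 1, 2, 3}, reinserted at element 0
//   largephi.extractslice1: mask = {4, 5, 6, 7}, reinserted at element 4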
