
Commit 5da7179

[AMDGPU] Reland: Add IR LiveReg type-based optimization
1 parent 3386d24 commit 5da7179

11 files changed (+2606 -2036 lines)

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 295 additions & 7 deletions
@@ -50,6 +50,8 @@ class AMDGPULateCodeGenPrepare
   AssumptionCache *AC = nullptr;
   UniformityInfo *UA = nullptr;
 
+  SmallVector<WeakTrackingVH, 8> DeadInsts;
+
 public:
   static char ID;
 
@@ -81,6 +83,69 @@ class AMDGPULateCodeGenPrepare
   bool visitLoadInst(LoadInst &LI);
 };
 
+using ValueToValueMap = DenseMap<const Value *, Value *>;
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
+  const GCNSubtarget *ST;
+  /// The scalar type to convert to
+  Type *ConvertToScalar;
+  /// The set of visited Instructions
+  SmallPtrSet<Instruction *, 4> Visited;
+  /// Map of Value -> Converted Value
+  ValueToValueMap ValMap;
+  /// Map containing conversions from Optimal Type -> Original Type per BB.
+  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;
+
+public:
+  /// Calculate and \p return the type to convert to given a problematic \p
+  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
+  Type *calculateConvertType(Type *OriginalType);
+  /// Convert the virtual register defined by \p V to the compatible vector of
+  /// legal type
+  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
+  /// Convert the virtual register defined by \p V back to the original type \p
+  /// ConvertType, stripping away the MSBs in cases where there was an imperfect
+  /// fit (e.g. v2i32 -> v7i8)
+  Value *convertFromOptType(Type *ConvertType, Instruction *V,
+                            BasicBlock::iterator &InstPt,
+                            BasicBlock *InsertBlock);
+  /// Check for problematic PHI nodes or cross-BB values based on the value
+  /// defined by \p I, and coerce to legal types if necessary. For problematic
+  /// PHI nodes, we coerce all incoming values in a single invocation.
+  bool optimizeLiveType(Instruction *I,
+                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);
+
+  // Whether or not the type should be replaced to avoid inefficient
+  // legalization code
+  bool shouldReplace(Type *ITy) {
+    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
+    if (!VTy)
+      return false;
+
+    auto TLI = ST->getTargetLowering();
+
+    Type *EltTy = VTy->getElementType();
+    // If the element size is not less than the convert to scalar size, then we
+    // can't do any bit packing
+    if (!EltTy->isIntegerTy() ||
+        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
+      return false;
+
+    // Only coerce illegal types
+    TargetLoweringBase::LegalizeKind LK =
+        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
+    return LK.first != TargetLoweringBase::TypeLegal;
+  }
+
+  LiveRegOptimizer(Module *Mod, const GCNSubtarget *ST) : Mod(Mod), ST(ST) {
+    DL = &Mod->getDataLayout();
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
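A note on shouldReplace: the filter accepts only fixed vectors whose integer elements are no wider than the i32 convert-to scalar, and then only when TargetLowering reports the element type as illegal, so on AMDGPU a v4i8 qualifies while a v2i32 or v4f32 does not. Below is a minimal standalone sketch of the size-based half of that filter; isPackingCandidate is a hypothetical helper invented for illustration, and the target-legality query is omitted because it needs a GCNSubtarget.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical helper mirroring only the size checks of shouldReplace().
// The real pass additionally requires TLI->getTypeConversion() to report
// the element type as something other than TypeLegal.
static bool isPackingCandidate(Type *ITy) {
  auto *VTy = dyn_cast<FixedVectorType>(ITy);
  if (!VTy)
    return false;
  Type *EltTy = VTy->getElementType();
  return EltTy->isIntegerTy() && EltTy->getScalarSizeInBits() <= 32;
}

int main() {
  LLVMContext Ctx;
  Type *V4I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 4);   // accepted
  Type *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4); // rejected
  return isPackingCandidate(V4I8) && !isPackingCandidate(V4F32) ? 0 : 1;
}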
@@ -96,17 +161,30 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  if (ST.hasScalarSubwordLoads())
-    return false;
 
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  // "Optimize" the virtual regs that cross basic block boundaries. When
+  // building the SelectionDAG, vectors of illegal types that cross basic blocks
+  // will be scalarized and widened, with each scalar living in its
+  // own register. To work around this, this optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod, &ST);
+
   bool Changed = false;
-  for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB))
-      Changed |= visit(I);
 
+  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();
+
+  for (auto &BB : reverse(F))
+    for (Instruction &I : make_early_inc_range(reverse(BB))) {
+      Changed |= !HasScalarSubwordLoads && visit(I);
+      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
+    }
+
+  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
   return Changed;
 }
 
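The rewritten driver folds the old hasScalarSubwordLoads() early return into the per-instruction visit and now walks blocks and instructions in reverse order, with make_early_inc_range keeping the walk valid while either visitor deletes or replaces the current instruction. A reduced sketch of just the iteration pattern; walkBottomUp and Callback are stand-ins invented here, not part of the patch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Visit every instruction of F in reverse block and instruction order.
// make_early_inc_range advances past I before the callback runs, so the
// callback may erase or replace I without invalidating the traversal.
template <typename CallbackT>
static bool walkBottomUp(Function &F, CallbackT Callback) {
  bool Changed = false;
  for (BasicBlock &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB)))
      Changed |= Callback(I);
  return Changed;
}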

@@ -112,0 +191,18 @@
+Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+
+  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
+  unsigned ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  if (OriginalSize <= ConvertScalarSize)
+    return IntegerType::get(Mod->getContext(), ConvertScalarSize);
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         ConvertEltCount, false);
+}
+
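The conversion is a ceiling division by the 32-bit convert-to width: anything that fits in 32 bits becomes a single i32 (possibly widening, as in v3i8), and anything larger becomes a vector of i32. A few worked instances of the ConvertEltCount arithmetic above, as a self-contained check:

#include <cassert>

// Worked instances of calculateConvertType's arithmetic, bit widths only:
// ConvertEltCount = ceil(OriginalSize / 32).
static unsigned convertEltCount(unsigned OriginalBits) {
  return (OriginalBits + 31) / 32;
}

int main() {
  assert(convertEltCount(3 * 8) == 1);  // v3i8, 24 bits  -> i32 (widened)
  assert(convertEltCount(4 * 8) == 1);  // v4i8, 32 bits  -> i32 (exact fit)
  assert(convertEltCount(7 * 8) == 2);  // v7i8, 56 bits  -> v2i32
  assert(convertEltCount(4 * 16) == 2); // v4i16, 64 bits -> v2i32
  return 0;
}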
@@ -112,0 +209,30 @@
+Value *LiveRegOptimizer::convertToOptType(Instruction *V,
+                                          BasicBlock::iterator &InsertPt) {
+  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
+  Type *NewTy = calculateConvertType(V->getType());
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
+
+  IRBuilder<> Builder(V->getParent(), InsertPt);
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // desired type.
+  if (OriginalSize == NewSize)
+    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");
+
+  // If there is a bitsize mismatch, we must use a wider vector.
+  assert(NewSize > OriginalSize);
+  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();
+
+  SmallVector<int, 8> ShuffleMask;
+  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
+  for (unsigned I = 0; I < OriginalElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
+    ShuffleMask.push_back(OriginalElementCount);
+
+  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
+  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
+}
+
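To make the imperfect-fit path concrete: a v7i8 def (56 bits) is padded to v8i8 by a shufflevector whose last lane indexes one past the source elements (reading a poison lane), then bitcast to the v2i32 that calculateConvertType picked. A standalone sketch that builds exactly this sequence; the module and function scaffolding ("pack", the argument, the return) are invented for illustration:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sketch", Ctx);
  auto *V7I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 7);
  auto *V2I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 2);
  auto *FTy = FunctionType::get(V2I32, {V7I8}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "pack", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // Pad v7i8 to v8i8; lane index 7 points past the input, i.e. poison.
  SmallVector<int, 8> Mask = {0, 1, 2, 3, 4, 5, 6, 7};
  Value *Wide = B.CreateShuffleVector(F->getArg(0), Mask);
  // Reinterpret the 64 bits as the packed, legal live-out type v2i32.
  B.CreateRet(B.CreateBitCast(Wide, V2I32));

  verifyFunction(*F, &errs());
  M.print(outs(), nullptr);
  return 0;
}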
@@ -112,0 +239,38 @@
+Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
+                                            BasicBlock::iterator &InsertPt,
+                                            BasicBlock *InsertBB) {
+  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
+  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
+
+  IRBuilder<> Builder(InsertBB, InsertPt);
+  // If there is a bitsize match, we simply convert back to the original type.
+  if (OriginalSize == NewSize)
+    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");
+
+  // If there is a bitsize mismatch, then we must have used a wider value to
+  // hold the bits.
+  assert(OriginalSize > NewSize);
+  // For wide scalars, we can just truncate the value.
+  if (!V->getType()->isVectorTy()) {
+    Instruction *Trunc = cast<Instruction>(
+        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
+    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+  }
+
+  // For wider vectors, we must strip the MSBs to convert back to the original
+  // type.
+  VectorType *ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
+  Instruction *Converted =
+      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
+
+  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
+  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+
+  return Builder.CreateShuffleVector(Converted, ShuffleMask);
+}
+
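The use-side inverse follows the same shapes in reverse: an exact fit is a single bitcast, a packed scalar is truncated then bitcast, and a packed vector is bitcast to the widened narrow-element vector and shuffled down to drop the padding lanes. A sketch of the v2i32 -> v7i8 direction, under the same invented scaffolding as the previous example:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include <numeric>
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sketch", Ctx);
  auto *V2I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 2);
  auto *V7I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 7);
  auto *FTy = FunctionType::get(V7I8, {V2I32}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "unpack", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // Reinterpret the 64 packed bits as v8i8, the widened narrow-element type.
  Value *Wide =
      B.CreateBitCast(F->getArg(0), FixedVectorType::get(B.getInt8Ty(), 8));
  // Keep lanes 0..6, stripping the MSB padding lane added on the def side.
  SmallVector<int, 8> Mask(7);
  std::iota(Mask.begin(), Mask.end(), 0);
  B.CreateRet(B.CreateShuffleVector(Wide, Mask));

  M.print(outs(), nullptr);
  return 0;
}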
@@ -113,3 +277,127 @@
+bool LiveRegOptimizer::optimizeLiveType(
+    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+  SmallVector<Instruction *, 4> Worklist;
+  SmallPtrSet<PHINode *, 4> PhiNodes;
+  SmallPtrSet<Instruction *, 4> Defs;
+  SmallPtrSet<Instruction *, 4> Uses;
+
+  Worklist.push_back(cast<Instruction>(I));
+  while (!Worklist.empty()) {
+    Instruction *II = Worklist.pop_back_val();
+
+    if (!Visited.insert(II).second)
+      continue;
+
+    if (!shouldReplace(II->getType()))
+      continue;
+
+    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
+      PhiNodes.insert(Phi);
+      // Collect all the incoming values of problematic PHI nodes.
+      for (Value *V : Phi->incoming_values()) {
+        // Repeat the collection process for newly found PHI nodes.
+        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
+          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+            Worklist.push_back(OpPhi);
+          continue;
+        }
+
+        Instruction *IncInst = dyn_cast<Instruction>(V);
+        // Other incoming value types (e.g. vector literals) are unhandled.
+        if (!IncInst && !isa<ConstantAggregateZero>(V))
+          return false;
+
+        // Collect all other incoming values for coercion.
+        if (IncInst)
+          Defs.insert(IncInst);
+      }
+    }
+
+    // Collect all relevant uses.
+    for (User *V : II->users()) {
+      // Repeat the collection process for problematic PHI nodes.
+      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
+        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+          Worklist.push_back(OpPhi);
+        continue;
+      }
+
+      Instruction *UseInst = cast<Instruction>(V);
+      // Collect all uses of PHINodes and any use that crosses BB boundaries.
+      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
+        Uses.insert(UseInst);
+        if (!Defs.count(II) && !isa<PHINode>(II)) {
+          Defs.insert(II);
+        }
+      }
+    }
+  }
+
+  // Coerce and track the defs.
+  for (Instruction *D : Defs) {
+    if (!ValMap.contains(D)) {
+      BasicBlock::iterator InsertPt = std::next(D->getIterator());
+      Value *ConvertVal = convertToOptType(D, InsertPt);
+      assert(ConvertVal);
+      ValMap[D] = ConvertVal;
+    }
+  }
+
+  // Construct new-typed PHI nodes.
+  for (PHINode *Phi : PhiNodes) {
+    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
+                                  Phi->getNumIncomingValues(),
+                                  Phi->getName() + ".tc", Phi->getIterator());
+  }
+
+  // Connect all the PHI nodes with their new incoming values.
+  for (PHINode *Phi : PhiNodes) {
+    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+    bool MissingIncVal = false;
+    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
+      Value *IncVal = Phi->getIncomingValue(I);
+      if (isa<ConstantAggregateZero>(IncVal)) {
+        Type *NewType = calculateConvertType(Phi->getType());
+        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
+                            Phi->getIncomingBlock(I));
+      } else if (ValMap.contains(IncVal))
+        NewPhi->addIncoming(ValMap[IncVal], Phi->getIncomingBlock(I));
+      else
+        MissingIncVal = true;
+    }
+    Instruction *DeadInst = Phi;
+    if (MissingIncVal) {
+      DeadInst = cast<Instruction>(ValMap[Phi]);
+      // Do not use the dead phi.
+      ValMap[Phi] = Phi;
+    }
+    DeadInsts.emplace_back(DeadInst);
+  }
+  // Coerce back to the original type and replace the uses.
+  for (Instruction *U : Uses) {
+    // Replace all converted operands for a use.
+    for (auto [OpIdx, Op] : enumerate(U->operands())) {
+      if (ValMap.contains(Op)) {
+        Value *NewVal = nullptr;
+        if (BBUseValMap.contains(U->getParent()) &&
+            BBUseValMap[U->getParent()].contains(ValMap[Op]))
+          NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
+        else {
+          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
+          NewVal =
+              convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
+                                 InsertPt, U->getParent());
+          BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+        }
+        assert(NewVal);
+        U->setOperand(OpIdx, NewVal);
+      }
+    }
+  }
+
+  return true;
+}
+
 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   unsigned AS = LI.getPointerAddressSpace();
   // Skip non-constant address space.
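Taken together, optimizeLiveType runs in three phases: a worklist walk collects the transitive web of problematic PHIs plus the defs and uses that cross block boundaries; each def is packed once (cached in ValMap); and each using block unpacks once per packed value (cached in BBUseValMap), with new PHIs built over the packed type. The kind of input it targets is a small-element vector live across blocks, as in the hypothetical builder below, which constructs a v4i8 PHI the pass would retype to an i32 PHI (named "phi.tc") with casts on either side:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sketch", Ctx);
  auto *V4I8 = FixedVectorType::get(Type::getInt8Ty(Ctx), 4);
  auto *FTy = FunctionType::get(V4I8, {V4I8, Type::getInt1Ty(Ctx)}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Then = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *Exit = BasicBlock::Create(Ctx, "exit", F);

  IRBuilder<> B(Entry);
  // A v4i8 def whose value is live out of this block.
  Value *Def = B.CreateAdd(F->getArg(0), F->getArg(0), "def");
  B.CreateCondBr(F->getArg(1), Then, Exit);
  B.SetInsertPoint(Then);
  B.CreateBr(Exit);

  B.SetInsertPoint(Exit);
  // The problematic PHI: v4i8 incoming values from two predecessors. The
  // pass would replace it with an i32 PHI fed by bitcasts after the defs,
  // converting back before the use (here, the return). The zeroinitializer
  // incoming value exercises the ConstantAggregateZero case above.
  PHINode *Phi = B.CreatePHI(V4I8, 2, "phi");
  Phi->addIncoming(Def, Entry);
  Phi->addIncoming(Constant::getNullValue(V4I8), Then);
  B.CreateRet(Phi);

  M.print(outs(), nullptr);
  return 0;
}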
@@ -119,7 +407,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
   // Skip non-simple loads.
   if (!LI.isSimple())
     return false;
-  auto *Ty = LI.getType();
+  Type *Ty = LI.getType();
   // Skip aggregate types.
   if (Ty->isAggregateType())
     return false;
@@ -181,7 +469,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
   auto *NewVal = IRB.CreateBitCast(
       IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
   LI.replaceAllUsesWith(NewVal);
-  RecursivelyDeleteTriviallyDeadInstructions(&LI);
+  DeadInsts.emplace_back(&LI);
 
   return true;
 }
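One more recurring change: instead of calling RecursivelyDeleteTriviallyDeadInstructions from inside the visitor, dead values are now queued on the pass's DeadInsts vector and reclaimed once after the whole traversal (see runOnFunction above). The handles are WeakTrackingVH, so entries that die or get replaced in the meantime are tolerated. A minimal sketch of the pattern, assuming the usual Local.h helper:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Record now, delete later: safe while iterating with make_early_inc_range.
static void markDead(Instruction &I,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  DeadInsts.emplace_back(&I);
}

// One sweep after the traversal; handles that already died are skipped.
static void sweep(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
}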

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 2 deletions
@@ -1197,10 +1197,10 @@ bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPULateCodeGenPreparePass());
+    addPass(createSinkingPass());
 
   if (TM->getOptLevel() > CodeGenOptLevel::None)
-    addPass(createSinkingPass());
+    addPass(createAMDGPULateCodeGenPreparePass());
 
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
