llvm
diff --git a/‎llvm/include/llvm/Analysis/TargetTransformInfo.h
Lines changed: 7 additions & 0 deletions b/‎llvm/include/llvm/Analysis/TargetTransformInfo.h
Lines changed: 7 additions & 0 deletions
diff --git a/‎llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Lines changed: 2 additions & 0 deletions b/‎llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/Analysis/TargetTransformInfo.cpp
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Analysis/TargetTransformInfo.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 22 additions & 3 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 22 additions & 3 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Lines changed: 9 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Lines changed: 9 additions & 0 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Lines changed: 15 additions & 4 deletions b/‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Lines changed: 15 additions & 4 deletions
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
   /// pad to. Default is no padding.
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
 
+  /// \return true if four i8 values can be vectorized into a single i32.
+  /// Currently only used by the SLP vectorizer.
+  bool canVectorizei8s() const;
+
   /// @}
 
   /// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
   virtual void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
+  virtual bool canVectorizei8s() const = 0;
 };
 
 template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
     Impl.collectKernelLaunchBounds(F, LB);
   }
+
+  bool canVectorizei8s() const override { return Impl.canVectorizei8s(); }
 };
 
 template <typename T>
 
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool canVectorizei8s() const { return false; }
+
   unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
     return 0;
   }
 
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
   return TTIImpl->getMaxNumArgs();
 }
 
+bool TargetTransformInfo::canVectorizei8s() const {
+  return TTIImpl->canVectorizei8s();
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
 
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
   }
 }
 
+/// Scalarization overhead query for \p InTy. Vectors of more than one i8
+/// element are reported as free (cost 0); every other type is forwarded
+/// unchanged to the base implementation.
+InstructionCost GCNTTIImpl::getScalarizationOverhead(
+    VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
+  // cast<> asserts that InTy is a fixed-width vector before anything else.
+  const unsigned EltCount = cast<FixedVectorType>(InTy)->getNumElements();
+  const bool HasI8Elts = InTy->getElementType()->isIntegerTy(8);
+
+  if (EltCount <= 1 || !HasI8Elts)
+    return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
+                                           CostKind, VL);
+  return 0;
+}
+
 InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                            VectorType *VT, ArrayRef<int> Mask,
                                            TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   // Larger vector widths may require additional instructions, but are
   // typically cheaper than scalarized versions.
   unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+
+  if (NumVectorElts > 1 &&
+      VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
+    return 0;
+
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       DL.getTypeSizeInBits(VT->getElementType()) == 16) {
     bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+bool GCNTTIImpl::canVectorizei8s() const { return true; }
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   InstructionCost getVectorSplitCost() { return 0; }
 
+  InstructionCost getScalarizationOverhead(VectorType *InTy,
+                                           const APInt &DemandedElts,
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind,
+                                           ArrayRef<Value *> VL = {});
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask,
                                  TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// \return true if we can pack 4 i8s into an i32.
+  bool canVectorizei8s() const;
 };
 
 } // end namespace llvm
 
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
               LI0->getPointerAddressSpace(), CostKind);
 
         } else {
-          VecLdCost = TTI->getMemoryOpCost(
-              Instruction::Load, VecTy, LI0->getAlign(),
-              LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+          if (VecTy->getElementType() ==
+                  IntegerType::getInt8Ty(VecTy->getContext()) &&
+              TTI->canVectorizei8s()) {
+            VecLdCost = 1;
+          } else {
+            VecLdCost =
+                TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
+                                     LI0->getPointerAddressSpace(), CostKind,
+                                     TTI::OperandValueInfo());
+          }
         }
         break;
       case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     // provided vectorization factor (i.e. the scalar type is used for vector
     // code during codegen).
     auto *VecTy = getWidenedType(ScalarTy, VF);
-    if (TTI->getNumberOfParts(VecTy) == VF)
+    unsigned NumParts = TTI->getNumberOfParts(VecTy);
+    if (TTI->canVectorizei8s() &&
+        VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+      NumParts = 1;
+    if (NumParts == VF)
       continue;
     for (unsigned I = NextInst; I < MaxInst; ++I) {
       unsigned ActualVF = std::min(MaxInst - I, VF);
Original file line number	Diff line number	Diff line change
`@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {`
`1063`	`1063`
`1064`	`1064`	`unsigned getMaxNumArgs() const { return UINT_MAX; }`
`1065`	`1065`
	`1066`	`+ bool canVectorizei8s() const { return false; }`
	`1067`	`+`
`1066`	`1068`	`unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {`
`1067`	`1069`	`return 0;`
`1068`	`1070`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {`
`1419`	`1419`	`return TTIImpl->getMaxNumArgs();`
`1420`	`1420`	`}`
`1421`	`1421`
	`1422`	`+bool TargetTransformInfo::canVectorizei8s() const {`
	`1423`	`+ return TTIImpl->canVectorizei8s();`
	`1424`	`+}`
	`1425`	`+`
`1422`	`1426`	`bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {`
`1423`	`1427`	`return TTIImpl->shouldExpandReduction(II);`
`1424`	`1428`	`}`