llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Lines changed: 37 additions & 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Lines changed: 14 additions & 0 deletions
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
Lines changed: 14 additions & 14 deletions
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-       : 1;
+  return ElemWidth == 8                                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -537,6 +538,12 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
+  VectorType *VecTy = dyn_cast<VectorType>(Ty);
+  InstructionCost LTTypeCost = LT.first;
+  if (VecTy &&
+      VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
+    LTTypeCost = (((LT.first - 1) / 4) + 1);
+
   switch (ISD) {
   case ISD::SHL:
   case ISD::SRL:
@@ -548,7 +555,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
       NElts = (NElts + 1) / 2;
 
     // i32
-    return getFullRateInstrCost() * LT.first * NElts;
+    return getFullRateInstrCost() * LTTypeCost * NElts;
   case ISD::ADD:
   case ISD::SUB:
   case ISD::AND:
@@ -562,7 +569,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
 
-    return LT.first * NElts * getFullRateInstrCost();
+    return LTTypeCost * NElts * getFullRateInstrCost();
   case ISD::MUL: {
     const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
     if (SLT == MVT::i64) {
@@ -574,7 +581,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
       NElts = (NElts + 1) / 2;
 
     // i32
-    return QuarterRateCost * NElts * LT.first;
+    return QuarterRateCost * NElts * LTTypeCost;
   }
   case ISD::FMUL:
     // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
@@ -1423,3 +1430,27 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  // Loading a fixed <N x i8> vector costs ceil(N / 4); scalable vectors have no
+  // fixed element count, so they (like all other cases) use the base cost.
+  if (auto *VecTy = dyn_cast<FixedVectorType>(Src))
+    if (Opcode == Instruction::Load && VecTy->getElementType()->isIntegerTy(8))
+      return ((VecTy->getNumElements() - 1) / 4) + 1;
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // A fixed <N x i8> vector counts as ceil(N / 4) parts. Scalable vectors have
+  // no fixed element count (getFixedValue() would assert), so use the base.
+  if (auto *VecTy = dyn_cast<FixedVectorType>(Tp))
+    if (VecTy->getElementType()->isIntegerTy(8))
+      return ((VecTy->getNumElements() - 1) / 4) + 1;
+  return BaseT::getNumberOfParts(Tp);
+}
@@ -282,6 +282,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
+
+  /// Account for loads of i8 vector types having reduced cost. For
+  /// example, the cost of loading four i8 values is the same as the cost
+  /// of loading a single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr);
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp);
 };
 
 } // end namespace llvm
 
@@ -126,24 +126,24 @@ define amdgpu_kernel void @add_i16() #0 {
 define amdgpu_kernel void @add_i8() #0 {
 ; ALL-LABEL: 'add_i8'
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
-; ALL-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
+; ALL-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'add_i8'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %i8 = add i8 undef, undef