Skip to content

Commit cc100d8

Browse files
committed
Enable vectorization of i8 values.
1 parent 661f90a commit cc100d8

File tree

8 files changed

+440
-194
lines changed

8 files changed

+440
-194
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
return ElemWidth == 8 ? 4
348+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
349+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
350+
: 1;
350351
}
351352

352353
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -537,6 +538,12 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
537538

538539
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
539540

541+
VectorType *VecTy = dyn_cast<VectorType>(Ty);
542+
InstructionCost LTTypeCost = LT.first;
543+
if (VecTy &&
544+
VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
545+
LTTypeCost = (((LT.first - 1) / 4) + 1);
546+
540547
switch (ISD) {
541548
case ISD::SHL:
542549
case ISD::SRL:
@@ -548,7 +555,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
548555
NElts = (NElts + 1) / 2;
549556

550557
// i32
551-
return getFullRateInstrCost() * LT.first * NElts;
558+
return getFullRateInstrCost() * LTTypeCost * NElts;
552559
case ISD::ADD:
553560
case ISD::SUB:
554561
case ISD::AND:
@@ -562,7 +569,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
562569
if (ST->has16BitInsts() && SLT == MVT::i16)
563570
NElts = (NElts + 1) / 2;
564571

565-
return LT.first * NElts * getFullRateInstrCost();
572+
return LTTypeCost * NElts * getFullRateInstrCost();
566573
case ISD::MUL: {
567574
const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
568575
if (SLT == MVT::i64) {
@@ -574,7 +581,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
574581
NElts = (NElts + 1) / 2;
575582

576583
// i32
577-
return QuarterRateCost * NElts * LT.first;
584+
return QuarterRateCost * NElts * LTTypeCost;
578585
}
579586
case ISD::FMUL:
580587
// Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
@@ -1423,3 +1430,25 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231430
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14241431
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14251432
}
1433+
1434+
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1435+
Align Alignment,
1436+
unsigned AddressSpace,
1437+
TTI::TargetCostKind CostKind,
1438+
TTI::OperandValueInfo OpInfo,
1439+
const Instruction *I) {
1440+
VectorType *VecTy = dyn_cast<VectorType>(Src);
1441+
if (VecTy && Opcode == Instruction::Load &&
1442+
VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
1443+
return 1;
1444+
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1445+
OpInfo, I);
1446+
}
1447+
1448+
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
1449+
VectorType *VecTy = dyn_cast<VectorType>(Tp);
1450+
if (VecTy &&
1451+
VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
1452+
return 1;
1453+
return BaseT::getNumberOfParts(Tp);
1454+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,16 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
282282
void collectKernelLaunchBounds(
283283
const Function &F,
284284
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
285+
286+
/// \return The cost of Load and Store instructions.
287+
InstructionCost getMemoryOpCost(
288+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
289+
TTI::TargetCostKind CostKind,
290+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
291+
const Instruction *I = nullptr);
292+
293+
/// \return number of parts in this type.
294+
unsigned getNumberOfParts(Type *Tp);
285295
};
286296

287297
} // end namespace llvm

llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -126,24 +126,24 @@ define amdgpu_kernel void @add_i16() #0 {
126126
define amdgpu_kernel void @add_i8() #0 {
127127
; ALL-LABEL: 'add_i8'
128128
; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
129-
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
130-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
131-
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
132-
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
133-
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
134-
; ALL-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
135-
; ALL-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
129+
; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
130+
; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
131+
; ALL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
132+
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
133+
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
134+
; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
135+
; ALL-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
136136
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
137137
;
138138
; ALL-SIZE-LABEL: 'add_i8'
139139
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i8 = add i8 undef, undef
140-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i8 = add <2 x i8> undef, undef
141-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3i8 = add <3 x i8> undef, undef
142-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = add <4 x i8> undef, undef
143-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5i8 = add <5 x i8> undef, undef
144-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v6i8 = add <6 x i8> undef, undef
145-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32i8 = add <32 x i8> undef, undef
146-
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v33i8 = add <33 x i8> undef, undef
140+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = add <2 x i8> undef, undef
141+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i8 = add <3 x i8> undef, undef
142+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = add <4 x i8> undef, undef
143+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5i8 = add <5 x i8> undef, undef
144+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v6i8 = add <6 x i8> undef, undef
145+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8 = add <32 x i8> undef, undef
146+
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v33i8 = add <33 x i8> undef, undef
147147
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
148148
;
149149
%i8 = add i8 undef, undef

0 commit comments

Comments
 (0)