Skip to content

Commit 9a927fe

Browse files
committed
Enable vectorization of i8 values.
1 parent 661f90a commit 9a927fe

File tree

10 files changed

+447
-307
lines changed

10 files changed

+447
-307
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1907,6 +1907,10 @@ class TargetTransformInfo {
19071907
/// pad to. Default is no padding.
19081908
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
19091909

1910+
/// \return true if vectorizing 4 x i8s into an i32 is possible.
1911+
/// Currently only used by the SLP vectorizer.
1912+
bool canVectorizei8s() const;
1913+
19101914
/// @}
19111915

19121916
/// Collect kernel launch bounds for \p F into \p LB.
@@ -2363,6 +2367,7 @@ class TargetTransformInfo::Concept {
23632367
virtual void collectKernelLaunchBounds(
23642368
const Function &F,
23652369
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const = 0;
2370+
virtual bool canVectorizei8s() const = 0;
23662371
};
23672372

23682373
template <typename T>
@@ -3229,6 +3234,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
32293234
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override {
32303235
Impl.collectKernelLaunchBounds(F, LB);
32313236
}
3237+
3238+
/// Forwards the i8-vectorization query to the wrapped implementation object.
bool canVectorizei8s() const override {
  return Impl.canVectorizei8s();
}
32323239
};
32333240

32343241
template <typename T>

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,8 @@ class TargetTransformInfoImplBase {
10631063

10641064
/// Default implementation: always reports UINT_MAX.
unsigned getMaxNumArgs() const {
  return UINT_MAX;
}
10651065

1066+
/// Default implementation: i8 vectorization is reported as unsupported.
bool canVectorizei8s() const {
  return false;
}
1067+
10661068
/// Default implementation: always returns 0, independent of \p Size and
/// \p ArrayType.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
10671069
return 0;
10681070
}

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,6 +1419,10 @@ unsigned TargetTransformInfo::getMaxNumArgs() const {
14191419
return TTIImpl->getMaxNumArgs();
14201420
}
14211421

1422+
/// Forwards the query to the implementation object held in TTIImpl and
/// returns its answer unchanged.
bool TargetTransformInfo::canVectorizei8s() const {
1423+
return TTIImpl->canVectorizei8s();
1424+
}
1425+
14221426
/// Forwards the query for \p II to the implementation object held in TTIImpl
/// and returns its answer unchanged.
bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
14231427
return TTIImpl->shouldExpandReduction(II);
14241428
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
/// Returns the maximum vectorization factor for elements of width
/// \p ElemWidth (presumably in bits — confirm against callers) under
/// \p Opcode.
///
/// Loads and stores get 32 * 4 / ElemWidth. For any other opcode the updated
/// return statement yields 4 for 8-bit elements, 2 for 16-bit elements when
/// the subtarget has 16-bit instructions, 2 for 32-bit elements when the
/// subtarget has packed FP32 ops, and 1 otherwise.
///
/// This is a diff view: the rows following the `347-`..`349-` markers are the
/// removed return statement, and the rows following `347+`..`350+` are its
/// replacement, which adds the 8-bit case.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
return ElemWidth == 8 ? 4
348+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
349+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
350+
: 1;
350351
}
351352

352353
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11201121
}
11211122
}
11221123

1124+
/// Computes the scalarization overhead for the vector type \p InTy.
///
/// A vector with more than one element whose element type is i8 is costed at
/// 0. Every other type is delegated, with all arguments passed through
/// unchanged, to BaseT::getScalarizationOverhead.
///
/// NOTE(review): \p InTy is converted with an unchecked
/// cast<FixedVectorType> before any type test — confirm callers never pass a
/// non-fixed (scalable) vector type here, or guard the cast.
/// NOTE(review): the zero cost applies to any i8 vector with more than one
/// element, regardless of \p DemandedElts, \p Insert or \p Extract — confirm
/// this is intended for vectors wider than 4 x i8.
InstructionCost GCNTTIImpl::getScalarizationOverhead(
1125+
VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
1126+
TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
1127+
unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements();
1128+
if (NumVectorElts > 1 &&
1129+
InTy->getElementType() == IntegerType::getInt8Ty(InTy->getContext()))
1130+
return 0;
1131+
return BaseT::getScalarizationOverhead(InTy, DemandedElts, Insert, Extract,
1132+
CostKind, VL);
1133+
}
1134+
11231135
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11241136
VectorType *VT, ArrayRef<int> Mask,
11251137
TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11341146
// Larger vector widths may require additional instructions, but are
11351147
// typically cheaper than scalarized versions.
11361148
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1149+
1150+
if (NumVectorElts > 1 &&
1151+
VT->getElementType() == IntegerType::getInt8Ty(VT->getContext()))
1152+
return 0;
1153+
11371154
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
11381155
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
11391156
bool HasVOP3P = ST->hasVOP3PInsts();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231440
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14241441
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14251442
}
1443+
1444+
/// GCNTTIImpl unconditionally answers true to the i8-vectorization query.
bool GCNTTIImpl::canVectorizei8s() const {
  return true;
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,12 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
240240

241241
/// Always reports a cost of 0.
InstructionCost getVectorSplitCost() {
  return 0;
}
242242

243+
InstructionCost getScalarizationOverhead(VectorType *InTy,
244+
const APInt &DemandedElts,
245+
bool Insert, bool Extract,
246+
TTI::TargetCostKind CostKind,
247+
ArrayRef<Value *> VL = {});
248+
243249
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
244250
ArrayRef<int> Mask,
245251
TTI::TargetCostKind CostKind, int Index,
@@ -282,6 +288,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
282288
void collectKernelLaunchBounds(
283289
const Function &F,
284290
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
291+
292+
/// \return true if we can pack 4 i8s into an i32.
293+
bool canVectorizei8s() const;
285294
};
286295

287296
} // end namespace llvm

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12971,9 +12971,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1297112971
LI0->getPointerAddressSpace(), CostKind);
1297212972

1297312973
} else {
12974-
VecLdCost = TTI->getMemoryOpCost(
12975-
Instruction::Load, VecTy, LI0->getAlign(),
12976-
LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
12974+
if (VecTy->getElementType() ==
12975+
IntegerType::getInt8Ty(VecTy->getContext()) &&
12976+
TTI->canVectorizei8s()) {
12977+
VecLdCost = 1;
12978+
} else {
12979+
VecLdCost =
12980+
TTI->getMemoryOpCost(Instruction::Load, VecTy, LI0->getAlign(),
12981+
LI0->getPointerAddressSpace(), CostKind,
12982+
TTI::OperandValueInfo());
12983+
}
1297712984
}
1297812985
break;
1297912986
case TreeEntry::StridedVectorize: {
@@ -20927,7 +20934,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
2092720934
// provided vectorization factor (i.e. the scalar type is used for vector
2092820935
// code during codegen).
2092920936
auto *VecTy = getWidenedType(ScalarTy, VF);
20930-
if (TTI->getNumberOfParts(VecTy) == VF)
20937+
unsigned NumParts = TTI->getNumberOfParts(VecTy);
20938+
if (TTI->canVectorizei8s() &&
20939+
VecTy->getElementType() == IntegerType::getInt8Ty(VecTy->getContext()))
20940+
NumParts = 1;
20941+
if (NumParts == VF)
2093120942
continue;
2093220943
for (unsigned I = NextInst; I < MaxInst; ++I) {
2093320944
unsigned ActualVF = std::min(MaxInst - I, VF);

0 commit comments

Comments
 (0)