Skip to content

Commit f788a42

Browse files
committed
Enable vectorization of i8 values.
1 parent beffd15 commit f788a42

File tree

6 files changed

+448
-674
lines changed

6 files changed

+448
-674
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,13 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// For a given width return the max number of elements that can be combined
348+
// into a wider bit value:
349+
bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
350+
return (ElemWidth == 8 && isGFX8Plus) ? 4
351+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
352+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
353+
: 1;
350354
}
351355

352356
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1151,14 +1155,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11511155

11521156
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
11531157

1154-
// Larger vector widths may require additional instructions, but are
1155-
// typically cheaper than scalarized versions.
1156-
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1158+
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
11571159
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1158-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1159-
bool HasVOP3P = ST->hasVOP3PInsts();
1160+
(ScalarSize == 16 || ScalarSize == 8)) {
1161+
// Larger vector widths may require additional instructions, but are
1162+
// typically cheaper than scalarized versions.
1163+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
11601164
unsigned RequestedElts =
11611165
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1166+
unsigned EltsPerReg = 32 / ScalarSize;
11621167
if (RequestedElts == 0)
11631168
return 0;
11641169
switch (Kind) {
@@ -1167,9 +1172,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11671172
case TTI::SK_PermuteSingleSrc: {
11681173
// With op_sel VOP3P instructions freely can access the low half or high
11691174
// half of a register, so any swizzle of two elements is free.
1170-
if (HasVOP3P && NumVectorElts == 2)
1175+
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
11711176
return 0;
1172-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1177+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11731178
// SK_Broadcast just reuses the same mask
11741179
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11751180
return NumPerms + NumPermMasks;
@@ -1181,12 +1186,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11811186
return 0;
11821187
// Insert/extract subvectors only require shifts / extract code to get the
11831188
// relevant bits
1184-
return alignTo(RequestedElts, 2) / 2;
1189+
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11851190
}
11861191
case TTI::SK_PermuteTwoSrc:
11871192
case TTI::SK_Splice:
11881193
case TTI::SK_Select: {
1189-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1194+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11901195
// SK_Select just reuses the same mask
11911196
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11921197
return NumPerms + NumPermMasks;
@@ -1443,3 +1448,30 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14431448
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14441449
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14451450
}
1451+
1452+
/// Return the cost of the memory operation \p Opcode on type \p Src.
///
/// Loads and stores of fixed-width i8 vectors are costed as the number of
/// load/store vector registers (of the width reported by
/// getLoadStoreVecRegBitWidth for \p AddressSpace) needed to hold the whole
/// vector, instead of being costed per element. Every other opcode/type
/// combination is forwarded unchanged to the base implementation.
///
/// \param Opcode       IR opcode; only Instruction::Load and
///                     Instruction::Store take the i8 fast path.
/// \param Src          Type being loaded or stored.
/// \param Alignment    Forwarded to the base implementation.
/// \param AddressSpace Selects the load/store register width and is forwarded
///                     to the base implementation.
/// \param CostKind     Forwarded to the base implementation.
/// \param OpInfo       Forwarded to the base implementation.
/// \param I            Forwarded to the base implementation.
/// \returns ceil(size-in-bits(Src) / register-width) for i8 vector
///          loads/stores, otherwise the base implementation's cost.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  const bool IsLoadOrStore =
      Opcode == Instruction::Load || Opcode == Instruction::Store;
  if (IsLoadOrStore) {
    // Only fixed-width vectors have a compile-time bit size; anything else
    // falls through to the generic cost model below.
    if (auto *VecTy = dyn_cast<FixedVectorType>(Src)) {
      if (VecTy->getElementType()->isIntegerTy(8)) {
        const uint64_t VecBits = DL.getTypeSizeInBits(VecTy).getFixedValue();
        // divideCeil already rounds up, so the numerator must not be biased
        // by -1: that would under-count a vector that is exactly one unit
        // larger than a multiple of the register width.
        return divideCeil(VecBits, getLoadStoreVecRegBitWidth(AddressSpace));
      }
    }
  }
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
1468+
1469+
/// Return the number of parts \p Tp is counted as.
///
/// Fixed-width vectors of i8 are counted in groups of four elements (four i8
/// values packed into one 32-bit value), rounding up, so e.g. 1..4 elements
/// are one part and 5..8 elements are two parts. All other types, including
/// scalable vectors, use the base implementation.
///
/// \param Tp Type to count parts for.
/// \returns ceil(NumElements / 4) for fixed i8 vectors, otherwise the base
///          implementation's answer.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  if (auto *VecTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      // Four i8 elements fit in a single 32-bit value.
      constexpr unsigned I8EltsPerPart = 4;
      // divideCeil performs the round-up itself. Subtracting one from the
      // element count first would report zero parts for <1 x i8> and only a
      // single part for <5 x i8>.
      return divideCeil(VecTy->getNumElements(), I8EltsPerPart);
    }
  }
  return BaseT::getNumberOfParts(Tp);
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
281281
void collectKernelLaunchBounds(
282282
const Function &F,
283283
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
284+
285+
/// Account for loads of i8 vector types to have reduced cost. For
286+
/// example, the cost of loading four i8 values is the cost of loading
287+
/// a single i32 value.
288+
InstructionCost getMemoryOpCost(
289+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
290+
TTI::TargetCostKind CostKind,
291+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
292+
const Instruction *I = nullptr) const override;
293+
294+
/// When counting parts on AMD GPUs, account for i8s being grouped
295+
/// together under a single i32 value. Otherwise fall back to base
296+
/// implementation.
297+
unsigned getNumberOfParts(Type *Tp) const override;
284298
};
285299

286300
} // end namespace llvm

0 commit comments

Comments
 (0)