Skip to content

Commit c8ab418

Browse files
committed
Enable vectorization of i8 values.
1 parent 2db0289 commit c8ab418

File tree

6 files changed

+448
-674
lines changed

6 files changed

+448
-674
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,13 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
/// Returns the maximum number of \p ElemWidth-bit elements to combine for an
/// instruction with the given \p Opcode.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
// Loads and stores are bounded by 32 * 4 bits' worth of elements.
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// For a given width return the max number of elements that can be combined
348+
// into a wider bit value:
349+
bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
350+
return (ElemWidth == 8 && isGFX8Plus) ? 4
351+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
352+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
353+
: 1;
350354
}
351355

352356
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1195,14 +1199,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11951199

11961200
Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
11971201

1198-
// Larger vector widths may require additional instructions, but are
1199-
// typically cheaper than scalarized versions.
1200-
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1202+
unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
12011203
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1202-
DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
1203-
bool HasVOP3P = ST->hasVOP3PInsts();
1204+
(ScalarSize == 16 || ScalarSize == 8)) {
1205+
// Larger vector widths may require additional instructions, but are
1206+
// typically cheaper than scalarized versions.
1207+
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
12041208
unsigned RequestedElts =
12051209
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1210+
unsigned EltsPerReg = 32 / ScalarSize;
12061211
if (RequestedElts == 0)
12071212
return 0;
12081213
switch (Kind) {
@@ -1211,9 +1216,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12111216
case TTI::SK_PermuteSingleSrc: {
12121217
// With op_sel VOP3P instructions freely can access the low half or high
12131218
// half of a register, so any swizzle of two elements is free.
1214-
if (HasVOP3P && NumVectorElts == 2)
1219+
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
12151220
return 0;
1216-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1221+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12171222
// SK_Broadcast just reuses the same mask
12181223
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
12191224
return NumPerms + NumPermMasks;
@@ -1225,12 +1230,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
12251230
return 0;
12261231
// Insert/extract subvectors only require shifts / extract code to get the
12271232
// relevant bits
1228-
return alignTo(RequestedElts, 2) / 2;
1233+
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12291234
}
12301235
case TTI::SK_PermuteTwoSrc:
12311236
case TTI::SK_Splice:
12321237
case TTI::SK_Select: {
1233-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1238+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
12341239
// SK_Select just reuses the same mask
12351240
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
12361241
return NumPerms + NumPermMasks;
@@ -1505,3 +1510,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
15051510
return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
15061511
: KnownIEEEMode::On;
15071512
}
1513+
1514+
/// Cost of a memory operation on \p Src.
///
/// Loads and stores of vectors whose element type is i8 are costed by how
/// many load/store vector registers (for \p AddressSpace) the whole vector
/// occupies. Every other query is forwarded unchanged to the base
/// implementation.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  auto *VecTy = dyn_cast<VectorType>(Src);
  const bool IsLoadOrStore =
      Opcode == Instruction::Load || Opcode == Instruction::Store;
  const bool IsI8Vector = VecTy && VecTy->getElementType()->isIntegerTy(8);

  // Anything that is not an i8-vector load/store uses the generic cost.
  if (!IsLoadOrStore || !IsI8Vector)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  // One unit of cost per register-width chunk of the vector.
  // NOTE(review): the numerator is reduced by one bit before the rounding-up
  // division; this is preserved from the original code -- confirm whether it
  // is intentional.
  const unsigned RegBitWidth = getLoadStoreVecRegBitWidth(AddressSpace);
  return divideCeil(DL.getTypeSizeInBits(VecTy) - 1, RegBitWidth);
}
1530+
1531+
/// Number of parts \p Tp is split into.
///
/// Fixed-width vectors of i8 are counted in groups of four elements, since
/// four i8 values are grouped together under a single i32 value. All other
/// types (including scalable vectors, which have no fixed element count) use
/// the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
  // Match only fixed-width vectors so the element count is always a plain
  // number; asking a scalable vector for a fixed element count is invalid.
  if (const auto *VecTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      constexpr unsigned I8EltsPerPart = 4;
      // divideCeil already rounds up, so pass the real element count.
      // Subtracting one first would report 0 parts for <1 x i8> and only
      // 1 part for <5 x i8>.
      return divideCeil(VecTy->getNumElements(), I8EltsPerPart);
    }
  }
  return BaseT::getNumberOfParts(Tp);
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
288288
/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
289289
/// "amdgpu-ieee"="false".
290290
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
291+
292+
/// Account for loads of i8 vector types to have reduced cost. For
293+
/// example, the cost of loading 4 i8 values is the same as the cost of loading
294+
/// a single i32 value.
295+
InstructionCost getMemoryOpCost(
296+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
297+
TTI::TargetCostKind CostKind,
298+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
299+
const Instruction *I = nullptr) const override;
300+
301+
/// When counting parts on AMD GPUs, account for i8s being grouped
302+
/// together under a single i32 value. Otherwise fall back to base
303+
/// implementation.
304+
unsigned getNumberOfParts(Type *Tp) const override;
291305
};
292306

293307
} // end namespace llvm

0 commit comments

Comments
 (0)