Skip to content

Commit fa5860c

Browse files
jrbyrnes authored and bcahoon committed
[AMDGPU] Vectorize i8 Shuffles
Change-Id: I1078d3a259708558b5c9a39641743e40605041c9 (cherry picked from commit 0912559)
1 parent 0ed7214 commit fa5860c

File tree

8 files changed

+1001
-104
lines changed

8 files changed

+1001
-104
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306306
return !F || !ST->isSingleLaneExecution(*F);
307307
}
308308

309+
/// Returns the number of parts a value of type \p Tp is split into for
/// costing purposes.
///
/// Fixed vectors whose elements are 8 bits wide are counted in groups of four
/// elements, rounded up to a power of two; every other type is forwarded to
/// the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
  // For certain 8 bit ops, we can pack a v4i8 into a single part
  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
  // do not limit the numberOfParts for 8 bit vectors to the
  // legalization costs of such. It is left up to other target
  // queries (e.g. get*InstrCost) to decide the proper handling
  // of 8 bit vectors.
  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
      const unsigned ElCount = VTy->getElementCount().getFixedValue();
      // Round the group count up instead of truncating: a plain ElCount / 4
      // is 0 for vectors with fewer than four elements (which would report
      // zero parts) and under-counts sizes that are not a multiple of four
      // (e.g. 6 elements need two groups of four, not one).
      const unsigned NumGroups = (ElCount + 3) / 4;
      return PowerOf2Ceil(NumGroups);
    }
  }

  return BaseT::getNumberOfParts(Tp);
}
325+
309326
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310327
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311328
// registers. See getRegisterClassForType for the implementation.
@@ -337,9 +354,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
337354
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338355
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339356
return 32 * 4 / ElemWidth;
340-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342-
: 1;
357+
358+
return (ElemWidth == 8) ? 4
359+
: (ElemWidth == 16) ? 2
360+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
361+
: 1;
343362
}
344363

345364
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1133,14 +1152,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11331152

11341153
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
11351154

1136-
// Larger vector widths may require additional instructions, but are
1137-
// typically cheaper than scalarized versions.
1138-
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1155+
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
11391156
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1140-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1141-
bool HasVOP3P = ST->hasVOP3PInsts();
1157+
(ScalarSize == 16 || ScalarSize == 8)) {
1158+
// Larger vector widths may require additional instructions, but are
1159+
// typically cheaper than scalarized versions.
1160+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
11421161
unsigned RequestedElts =
11431162
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1163+
unsigned EltsPerReg = 32 / ScalarSize;
11441164
if (RequestedElts == 0)
11451165
return 0;
11461166
switch (Kind) {
@@ -1149,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11491169
case TTI::SK_PermuteSingleSrc: {
11501170
// With op_sel VOP3P instructions freely can access the low half or high
11511171
// half of a register, so any swizzle of two elements is free.
1152-
if (HasVOP3P && NumVectorElts == 2)
1172+
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
11531173
return 0;
1154-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1174+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11551175
// SK_Broadcast just reuses the same mask
11561176
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11571177
return NumPerms + NumPermMasks;
@@ -1163,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11631183
return 0;
11641184
// Insert/extract subvectors only require shifts / extract code to get the
11651185
// relevant bits
1166-
return alignTo(RequestedElts, 2) / 2;
1186+
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11671187
}
11681188
case TTI::SK_PermuteTwoSrc:
11691189
case TTI::SK_Splice:
11701190
case TTI::SK_Select: {
1171-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1191+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11721192
// SK_Select just reuses the same mask
11731193
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11741194
return NumPerms + NumPermMasks;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
117117
return TTI::PSK_FastHardware;
118118
}
119119

120+
unsigned getNumberOfParts(Type *Tp);
120121
unsigned getNumberOfRegisters(unsigned RCID) const;
121122
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
122123
unsigned getMinVectorRegisterBitWidth() const;

0 commit comments

Comments
 (0)