Skip to content

Commit 712efb5

Browse files
authored
Merge pull request llvm#370 from AMD-Lightning-Internal/amd/dev/jebyrnes/swdev-446684-rebase0
[AMDGPU] Add off-by-default flag to control i8 vectorization
2 parents 3391c1a + 0c24767 commit 712efb5

File tree

11 files changed

+3031
-31
lines changed

11 files changed

+3031
-31
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
313313
return !F || !ST->isSingleLaneExecution(*F);
314314
}
315315

316+
/// Returns the number of parts a value of type \p Tp is split into.
///
/// When the subtarget opts into coercing illegal types, fixed vectors of
/// 8 bit elements are counted as four elements per part; every other type
/// is forwarded to the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
  // For certain 8 bit ops, we can pack a v4i8 into a single part
  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
  // do not limit the numberOfParts for 8 bit vectors to the
  // legalization costs of such. It is left up to other target
  // queries (e.g. get*InstrCost) to decide the proper handling
  // of 8 bit vectors.
  if (auto *VTy = dyn_cast<FixedVectorType>(Tp)) {
    if (ST->shouldCoerceIllegalTypes() &&
        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
      unsigned ElCount = VTy->getElementCount().getFixedValue();
      // Vectors with fewer than four i8 elements still need one part:
      // ElCount / 4 truncates to 0 there and PowerOf2Ceil(0) yields 0,
      // which would report zero parts for e.g. v2i8.
      if (ElCount < 4)
        return 1;
      // NOTE(review): element counts that are not a multiple of 4 round
      // down before PowerOf2Ceil (e.g. 6 -> 1); confirm this is intended.
      return PowerOf2Ceil(ElCount / 4);
    }
  }

  return BaseT::getNumberOfParts(Tp);
}
333+
316334
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
317335
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318336
// registers. See getRegisterClassForType for the implementation.
@@ -344,9 +362,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344362
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345363
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346364
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
365+
366+
return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
367+
: (ElemWidth == 16) ? 2
368+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
369+
: 1;
350370
}
351371

352372
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1131,14 +1151,16 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11311151

11321152
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
11331153

1134-
// Larger vector widths may require additional instructions, but are
1135-
// typically cheaper than scalarized versions.
1136-
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1154+
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
11371155
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1138-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1139-
bool HasVOP3P = ST->hasVOP3PInsts();
1156+
(ScalarSize == 16 ||
1157+
(ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
1158+
// Larger vector widths may require additional instructions, but are
1159+
// typically cheaper than scalarized versions.
1160+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
11401161
unsigned RequestedElts =
11411162
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1163+
unsigned EltsPerReg = 32 / ScalarSize;
11421164
if (RequestedElts == 0)
11431165
return 0;
11441166
switch (Kind) {
@@ -1147,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11471169
case TTI::SK_PermuteSingleSrc: {
11481170
// With op_sel, VOP3P instructions can freely access the low half or high
11491171
// half of a register, so any swizzle of two elements is free.
1150-
if (HasVOP3P && NumVectorElts == 2)
1172+
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
11511173
return 0;
1152-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1174+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11531175
// SK_Broadcast just reuses the same mask
11541176
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11551177
return NumPerms + NumPermMasks;
@@ -1161,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11611183
return 0;
11621184
// Insert/extract subvectors only require shifts / extract code to get the
11631185
// relevant bits
1164-
return alignTo(RequestedElts, 2) / 2;
1186+
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11651187
}
11661188
case TTI::SK_PermuteTwoSrc:
11671189
case TTI::SK_Splice:
11681190
case TTI::SK_Select: {
1169-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1191+
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
11701192
// SK_Select just reuses the same mask
11711193
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11721194
return NumPerms + NumPermMasks;

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
118118
return TTI::PSK_FastHardware;
119119
}
120120

121+
unsigned getNumberOfParts(Type *Tp);
121122
unsigned getNumberOfRegisters(unsigned RCID) const;
122123
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
123124
unsigned getMinVectorRegisterBitWidth() const;

llvm/lib/Target/AMDGPU/GCNSubtarget.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ static cl::opt<unsigned>
5252
cl::desc("Number of addresses from which to enable MIMG NSA."),
5353
cl::init(2), cl::Hidden);
5454

55+
// Command-line switch "amdgpu-coerce-illegal-types". It is off by default
// (cl::init(false)) and marked cl::ReallyHidden. The GCNSubtarget
// constructor copies its value into ShouldCoerceIllegalTypes.
static cl::opt<bool>
56+
CoerceIllegal("amdgpu-coerce-illegal-types",
57+
cl::desc("Whether or not to coerce illegal types"),
58+
cl::ReallyHidden, cl::init(false));
59+
5560
GCNSubtarget::~GCNSubtarget() = default;
5661

5762
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
@@ -191,6 +196,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
191196
RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
192197
InstSelector =
193198
std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM);
199+
200+
ShouldCoerceIllegalTypes = CoerceIllegal;
194201
}
195202

196203
const SelectionDAGTargetInfo *GCNSubtarget::getSelectionDAGInfo() const {

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
259259
// Dummy feature to use for assembler in tablegen.
260260
bool FeatureDisable = false;
261261

262+
bool ShouldCoerceIllegalTypes = false;
263+
262264
private:
263265
SIInstrInfo InstrInfo;
264266
SITargetLowering TLInfo;
@@ -1445,6 +1447,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
14451447
// of sign-extending.
14461448
bool hasGetPCZeroExtension() const { return GFX12Insts; }
14471449

1450+
/// \returns whether or not we should coerce illegal types into vectors of
1451+
/// legal types for values that span basic blocks.
1452+
bool shouldCoerceIllegalTypes() const { return ShouldCoerceIllegalTypes; }
1453+
14481454
/// \returns SGPR allocation granularity supported by the subtarget.
14491455
unsigned getSGPRAllocGranule() const {
14501456
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);

0 commit comments

Comments
 (0)