-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Vectorize i8 Shuffles #95840
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
175740b
bc56bcd
eb89053
0d4011a
9588847
7405d61
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -306,6 +306,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { | |
return !F || !ST->isSingleLaneExecution(*F); | ||
} | ||
|
||
/// Returns the number of parts a value of type \p Tp is split into for
/// cost-modelling purposes.
///
/// Fixed vectors whose element type is 8 bits wide are counted as one part
/// per group of four elements; every other type defers to the base
/// implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
  // For certain 8 bit ops, we can pack a v4i8 into a single part
  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
  // do not limit the numberOfParts for 8 bit vectors to the
  // legalization costs of such. It is left up to other target
  // queries (e.g. get*InstrCost) to decide the proper handling
  // of 8 bit vectors.
  if (auto *VTy = dyn_cast<FixedVectorType>(Tp)) {
    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
      unsigned ElCount = VTy->getElementCount().getFixedValue();
      // Round up: a partially filled group of four still occupies a whole
      // part, so e.g. v2i8 reports 1 part (not 0) and v6i8 reports 2.
      return alignTo(ElCount, 4) / 4;
    }
  }

  return BaseT::getNumberOfParts(Tp);
}
|
||
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { | ||
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector | ||
// registers. See getRegisterClassForType for the implementation. | ||
|
@@ -337,9 +354,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { | |
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { | ||
if (Opcode == Instruction::Load || Opcode == Instruction::Store) | ||
return 32 * 4 / ElemWidth; | ||
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 | ||
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 | ||
: 1; | ||
|
||
return (ElemWidth == 8) ? 4 | ||
jrbyrnes marked this conversation as resolved.
Show resolved
Hide resolved
|
||
: (ElemWidth == 16) ? 2 | ||
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 | ||
: 1; | ||
} | ||
|
||
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, | ||
|
@@ -1140,14 +1159,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |
|
||
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp); | ||
|
||
// Larger vector widths may require additional instructions, but are | ||
// typically cheaper than scalarized versions. | ||
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements(); | ||
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType()); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible to handle these cases separately? They're more straightforward than the type legalization cost. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Separate the operation / type cost changes into different PRs? That's actually what I've tried to do -- it's a PR stacking problem. The relationship is mapped in the description, but basically this PR is meant for the changes to the shuffle cost and #91016 is meant for the changes required to enable SLP vectorization for i8s. The dependency is needed one way or the other to add the lit changes, and I decided to make #91016 the base as there is another PR in flight which depends on that as well. In short, this PR is meant to address the shuffle cost changes and will be landed atomically with #91016 (if at all) |
||
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && | ||
DL.getTypeSizeInBits(VT->getElementType()) == 16) { | ||
bool HasVOP3P = ST->hasVOP3PInsts(); | ||
(ScalarSize == 16 || ScalarSize == 8)) { | ||
// Larger vector widths may require additional instructions, but are | ||
// typically cheaper than scalarized versions. | ||
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements(); | ||
unsigned RequestedElts = | ||
count_if(Mask, [](int MaskElt) { return MaskElt != -1; }); | ||
unsigned EltsPerReg = 32 / ScalarSize; | ||
if (RequestedElts == 0) | ||
return 0; | ||
switch (Kind) { | ||
|
@@ -1156,9 +1176,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |
case TTI::SK_PermuteSingleSrc: { | ||
// With op_sel VOP3P instructions freely can access the low half or high | ||
// half of a register, so any swizzle of two elements is free. | ||
if (HasVOP3P && NumVectorElts == 2) | ||
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2) | ||
return 0; | ||
unsigned NumPerms = alignTo(RequestedElts, 2) / 2; | ||
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; | ||
// SK_Broadcast just reuses the same mask | ||
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms; | ||
return NumPerms + NumPermMasks; | ||
|
@@ -1170,12 +1190,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |
return 0; | ||
// Insert/extract subvectors only require shifts / extract code to get the | ||
// relevant bits | ||
return alignTo(RequestedElts, 2) / 2; | ||
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg; | ||
} | ||
case TTI::SK_PermuteTwoSrc: | ||
case TTI::SK_Splice: | ||
case TTI::SK_Select: { | ||
unsigned NumPerms = alignTo(RequestedElts, 2) / 2; | ||
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg; | ||
// SK_Select just reuses the same mask | ||
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms; | ||
return NumPerms + NumPermMasks; | ||
|
Uh oh!
There was an error while loading. Please reload this page.