Skip to content

[AMDGPU] Vectorize i8 Shuffles #95840

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142,905 changes: 142,905 additions & 0 deletions eq

Large diffs are not rendered by default.

44 changes: 32 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}

unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) { return BaseT::getNumberOfParts(Tp);
// For certain 8 bit ops, we can pack a v4i8 into a single part
// (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
// do not limit the numberOfParts for 8 bit vectors to the
// legalization costs of such. It is left up to other target
// queries (e.g. get*InstrCost) to decide the proper handling
// of 8 bit vectors.
if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
unsigned ElCount = VTy->getElementCount().getFixedValue();
return ElCount / 4;
}
}

return BaseT::getNumberOfParts(Tp);
}

unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
Expand Down Expand Up @@ -337,9 +354,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;

return (ElemWidth == 8) ? 4
: (ElemWidth == 16) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;
}

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
Expand Down Expand Up @@ -1140,14 +1159,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);

// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to handle these cases separately? They're more straightforward than the type legalization cost

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separate the operation / type cost changes into different PRs? That's actually what I've tried to do -- it's a PR stacking problem.

The relationship is mapped in the description, but basically this PR is meant for the changes to shufflecost and #91016 is meant for the changes required to enable SLP vectorization for i8s. The dependency is needed one way or the other to add the lit changes, and I decided to make #91016 the base as there is another PR in flight which depends on that as well.

In short, this PR is meant to address the shuffelcost changes and will be landed atomically with #91016 (if at all)

if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
unsigned EltsPerReg = 32 / ScalarSize;
if (RequestedElts == 0)
return 0;
switch (Kind) {
Expand All @@ -1156,9 +1176,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (HasVOP3P && NumVectorElts == 2)
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
Expand All @@ -1170,12 +1190,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, 2) / 2;
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
return TTI::PSK_FastHardware;
}

unsigned getNumberOfParts(Type *Tp);
unsigned getNumberOfRegisters(unsigned RCID) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
Expand Down
Loading
Loading