
[AMDGPU] Enable vectorization of i8 values. #134934

Open. Wants to merge 1 commit into main.
56 changes: 44 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -344,9 +344,13 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
return 32 * 4 / ElemWidth;
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;
// For a given width, return the max number of elements that can be combined
// into a wider bit value:
bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
Reviewer comment (Contributor): Don't do generation checks. This is already identical to has16BitInsts

Reviewer comment (Contributor), suggested change (delete this line):
bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
Reiterating.
return (ElemWidth == 8 && isGFX8Plus) ? 4
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
: 1;
}
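
For intuition, here is a sketch (not part of the patch) of what the updated getMaximumVF returns on a GFX8+ subtarget, assuming has16BitInsts() and hasPackedFP32Ops() both return true:

// Illustrative only, under the assumptions above:
//   getMaximumVF(8,  Instruction::Load) = 32 * 4 / 8 = 16
//   getMaximumVF(8,  Instruction::Mul)  = 4   // four i8s fit in one i32
//   getMaximumVF(16, Instruction::Mul)  = 2   // packed 16-bit instructions
//   getMaximumVF(32, Instruction::FMul) = 2   // packed FP32 instructions
//   getMaximumVF(64, Instruction::Mul)  = 1   // no packing available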

unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1195,14 +1199,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);

// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
(ScalarSize == 16 || ScalarSize == 8)) {
// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
unsigned EltsPerReg = 32 / ScalarSize;
if (RequestedElts == 0)
return 0;
switch (Kind) {
@@ -1211,9 +1216,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
case TTI::SK_PermuteSingleSrc: {
// With op_sel VOP3P instructions freely can access the low half or high
// half of a register, so any swizzle of two elements is free.
if (HasVOP3P && NumVectorElts == 2)
if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
return NumPerms + NumPermMasks;
Expand All @@ -1225,12 +1230,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return 0;
// Insert/extract subvectors only require shifts / extract code to get the
// relevant bits
return alignTo(RequestedElts, 2) / 2;
return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
}
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
return NumPerms + NumPermMasks;
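
To see the new per-register accounting at work, a worked example (illustrative, assuming an i8 element type):

// Worked example: SK_PermuteSingleSrc on <8 x i8>, all 8 mask lanes live.
//   ScalarSize    = 8  -> EltsPerReg = 32 / 8 = 4
//   RequestedElts = 8  -> NumPerms   = alignTo(8, 4) / 4 = 2
//   NumPermMasks  = NumPerms = 2   (only SK_Broadcast reuses one mask)
//   cost = NumPerms + NumPermMasks = 4
// The same shuffle on <8 x i16> (EltsPerReg = 2) would cost 4 + 4 = 8.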
@@ -1505,3 +1510,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
: KnownIEEEMode::On;
}

InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo,
const Instruction *I) const {
if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
VecTy->getElementType()->isIntegerTy(8)) {
return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
getLoadStoreVecRegBitWidth(AddressSpace));
}
}
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
OpInfo, I);
}
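
A few illustrative values for the new formula, under the assumption that getLoadStoreVecRegBitWidth returns 128 bits for the address space in question:

// Illustrative, assuming getLoadStoreVecRegBitWidth(AS) == 128:
//   load <4 x i8>  : 32 bits  -> divideCeil(31, 128)  = 1
//   load <16 x i8> : 128 bits -> divideCeil(127, 128) = 1
//   load <32 x i8> : 256 bits -> divideCeil(255, 128) = 2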

unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
if (VecTy->getElementType()->isIntegerTy(8)) {
unsigned ElementCount = VecTy->getElementCount().getFixedValue();
return divideCeil(ElementCount - 1, 4);
}
}
return BaseT::getNumberOfParts(Tp);
}
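
And the corresponding part counts, illustrating four i8 elements per 32-bit part:

// Illustrative part counts for i8 vectors:
//   <4 x i8>  -> divideCeil(3, 4)  = 1
//   <8 x i8>  -> divideCeil(7, 4)  = 2
//   <16 x i8> -> divideCeil(15, 4) = 4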
14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -288,6 +288,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
/// "amdgpu-ieee"="false".
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;

/// Account for loads and stores of i8 vector types having reduced cost. For
/// example, the cost of loading 4 i8 values is the cost of loading
/// a single i32 value.
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) const override;

/// When counting parts on AMD GPUs, account for i8s being grouped
/// together into a single i32 value. Otherwise fall back to the base
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
};

} // end namespace llvm