Skip to content

Commit 0ed7214

Browse files
jrbyrnes authored and bcahoon committed
[AMDGPU] Vectorize more 16 bit shuffles (llvm#90648)
In the case of larger vectors, we should still prefer the vectorized version (i.e. shufflevector vs. extract/insert chains).

In arithmetic chains, vectorization results in chains of packed math instructions (as opposed to unpack/repack and scalarized arithmetic): https://godbolt.org/z/c5onaf6G5

In chains with PHIs, vectorization again removes the unnecessary pack/repack code around basic blocks: https://godbolt.org/z/vz7zYzvhs

Change-Id: I2da3af0c596f2e3273553642a2b27f97f10509e7
(cherry picked from commit d1e9741)
1 parent 6e449b6 commit 0ed7214

File tree

5 files changed

+1151
-956
lines changed

5 files changed

+1151
-956
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 44 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1128,22 +1128,54 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11281128
TTI::TargetCostKind CostKind,
11291129
int Index, VectorType *SubTp,
11301130
ArrayRef<const Value *> Args) {
1131+
if (!isa<FixedVectorType>(VT))
1132+
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1133+
11311134
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
11321135

1133-
if (ST->hasVOP3PInsts()) {
1134-
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1135-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1136+
// Larger vector widths may require additional instructions, but are
1137+
// typically cheaper than scalarized versions.
1138+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1139+
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1140+
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1141+
bool HasVOP3P = ST->hasVOP3PInsts();
1142+
unsigned RequestedElts =
1143+
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1144+
if (RequestedElts == 0)
1145+
return 0;
1146+
switch (Kind) {
1147+
case TTI::SK_Broadcast:
1148+
case TTI::SK_Reverse:
1149+
case TTI::SK_PermuteSingleSrc: {
11361150
// With op_sel VOP3P instructions freely can access the low half or high
1137-
// half of a register, so any swizzle is free.
1138-
1139-
switch (Kind) {
1140-
case TTI::SK_Broadcast:
1141-
case TTI::SK_Reverse:
1142-
case TTI::SK_PermuteSingleSrc:
1151+
// half of a register, so any swizzle of two elements is free.
1152+
if (HasVOP3P && NumVectorElts == 2)
11431153
return 0;
1144-
default:
1145-
break;
1146-
}
1154+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1155+
// SK_Broadcast just reuses the same mask
1156+
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1157+
return NumPerms + NumPermMasks;
1158+
}
1159+
case TTI::SK_ExtractSubvector:
1160+
case TTI::SK_InsertSubvector: {
1161+
// Even aligned accesses are free
1162+
if (!(Index % 2))
1163+
return 0;
1164+
// Insert/extract subvectors only require shifts / extract code to get the
1165+
// relevant bits
1166+
return alignTo(RequestedElts, 2) / 2;
1167+
}
1168+
case TTI::SK_PermuteTwoSrc:
1169+
case TTI::SK_Splice:
1170+
case TTI::SK_Select: {
1171+
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1172+
// SK_Select just reuses the same mask
1173+
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1174+
return NumPerms + NumPermMasks;
1175+
}
1176+
1177+
default:
1178+
break;
11471179
}
11481180
}
11491181

0 commit comments

Comments
 (0)