@@ -1154,14 +1154,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1154
1154
1155
1155
Kind = improveShuffleKindFromMask (Kind, Mask, VT, Index, SubTp);
1156
1156
1157
- // Larger vector widths may require additional instructions, but are
1158
- // typically cheaper than scalarized versions.
1159
- unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1157
+ unsigned ScalarSize = DL.getTypeSizeInBits (VT->getElementType ());
1160
1158
if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1161
- DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
1162
- bool HasVOP3P = ST->hasVOP3PInsts ();
1159
+ (ScalarSize == 16 || ScalarSize == 8 )) {
1160
+ // Larger vector widths may require additional instructions, but are
1161
+ // typically cheaper than scalarized versions.
1162
+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1163
1163
unsigned RequestedElts =
1164
1164
count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1165
+ unsigned EltsPerReg = 32 / ScalarSize;
1165
1166
if (RequestedElts == 0 )
1166
1167
return 0 ;
1167
1168
switch (Kind) {
@@ -1170,9 +1171,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1170
1171
case TTI::SK_PermuteSingleSrc: {
1171
1172
// With op_sel, VOP3P instructions can freely access the low half or high
1172
1173
// half of a register, so any swizzle of two elements is free.
1173
- if (HasVOP3P && NumVectorElts == 2 )
1174
+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
1174
1175
return 0 ;
1175
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1176
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1176
1177
// SK_Broadcast just reuses the same mask
1177
1178
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1178
1179
return NumPerms + NumPermMasks;
@@ -1184,12 +1185,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1184
1185
return 0 ;
1185
1186
// Insert/extract subvectors only require shifts / extract code to get the
1186
1187
// relevant bits
1187
- return alignTo (RequestedElts, 2 ) / 2 ;
1188
+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1188
1189
}
1189
1190
case TTI::SK_PermuteTwoSrc:
1190
1191
case TTI::SK_Splice:
1191
1192
case TTI::SK_Select: {
1192
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1193
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1193
1194
// SK_Select just reuses the same mask
1194
1195
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1195
1196
return NumPerms + NumPermMasks;
0 commit comments