@@ -306,6 +306,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // For certain 8-bit ops we can pack a v4i8 into a single part
+  // (e.g. a v4i8 shufflevector lowers to v_perm v4i8, v4i8), so we
+  // do not limit the number of parts for 8-bit vectors to their
+  // legalization cost. It is left to other target queries
+  // (e.g. get*InstrCost) to decide the proper handling of
+  // 8-bit vectors.
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
+      return PowerOf2Ceil(ElCount / 4);
+    }
+  }
+
+  return BaseT::getNumberOfParts(Tp);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
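
For intuition, a minimal standalone sketch of the part-count math above. The helpers are hypothetical stand-ins: `powerOf2Ceil` mirrors `llvm::PowerOf2Ceil`, and the vector-type plumbing is elided:

```cpp
#include <cstdint>

// Stand-in for llvm::PowerOf2Ceil: round up to the next power of two
// (0 maps to 0, matching the LLVM helper).
static uint64_t powerOf2Ceil(uint64_t V) {
  if (V == 0)
    return 0;
  uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

// Hypothetical mirror of the 8-bit branch of getNumberOfParts:
// v4i8 -> 1 part, v8i8 -> 2, v16i8 -> 4; v12i8 gives 12 / 4 = 3,
// which powerOf2Ceil rounds up to 4.
static unsigned numPartsForI8Vector(unsigned ElCount) {
  return powerOf2Ceil(ElCount / 4);
}
```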
@@ -337,9 +354,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+
+  return (ElemWidth == 8)                              ? 4
+         : (ElemWidth == 16)                           ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
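
Read as a table, the non-memory path now maps element width to a maximum VF. A sketch under the assumption that the packed-FP32 subtarget check is the only remaining gate (names hypothetical):

```cpp
// Hypothetical mirror of the updated non-load/store getMaximumVF logic.
static unsigned maxVFForWidth(unsigned ElemWidth, bool HasPackedFP32Ops) {
  return (ElemWidth == 8)                        ? 4 // v4i8 packs into one 32-bit VGPR
         : (ElemWidth == 16)                     ? 2 // v2i16 / v2f16 packed ops
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2 // packed FP32 (e.g. v_pk_fma_f32)
                                                  : 1;
}
```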
@@ -1133,14 +1152,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
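
The new `EltsPerReg` is the pivot of the costing below; a worked evaluation (values only, names hypothetical):

```cpp
// One 32-bit VGPR holds 32 / ScalarSize elements:
unsigned EltsPerReg16 = 32 / 16; // 2 halves, addressable via op_sel
unsigned EltsPerReg8 = 32 / 8;   // 4 bytes, addressable via v_perm
```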
@@ -1149,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel, VOP3P instructions can freely access the low or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
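
To make the perm arithmetic concrete, a minimal sketch assuming every mask lane is live; `alignToMul` stands in for `llvm::alignTo`:

```cpp
// Stand-in for llvm::alignTo: round V up to a multiple of A.
static unsigned alignToMul(unsigned V, unsigned A) {
  return (V + A - 1) / A * A;
}

// Hypothetical mirror of the SK_PermuteSingleSrc / SK_Broadcast costing:
// one perm instruction per register's worth of requested elements, plus
// one mask constant per perm (a broadcast reuses a single mask).
static unsigned permuteCost(unsigned RequestedElts, unsigned EltsPerReg,
                            bool IsBroadcast) {
  unsigned NumPerms = alignToMul(RequestedElts, EltsPerReg) / EltsPerReg;
  unsigned NumPermMasks = IsBroadcast ? 1 : NumPerms;
  return NumPerms + NumPermMasks;
}
// e.g. a full v8i8 permute: permuteCost(8, /*EltsPerReg=*/4, false) == 4,
// i.e. two v_perm instructions plus two mask constants.
```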
@@ -1163,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
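
Worked instances of the remaining cases, under the same all-lanes-live assumption (numbers illustrative, not from the patch):

```cpp
// SK_ExtractSubvector: pulling v2i8 out of v8i8 requests 2 lanes,
//   so the cost is alignTo(2, 4) / 4 = 1 shift/extract sequence.
// SK_Select on v4i8 with all 4 lanes used:
//   NumPerms = alignTo(4, 4) / 4 = 1, NumPermMasks = 1 (mask reused),
//   total cost = 1 + 1 = 2.
```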