@@ -313,6 +313,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // For certain 8 bit ops, we can pack a v4i8 into a single part
+  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
+  // do not limit the numberOfParts for 8 bit vectors to the
+  // legalization costs of such. It is left up to other target
+  // queries (e.g. get*InstrCost) to decide the proper handling
+  // of 8 bit vectors.
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
+      return PowerOf2Ceil(ElCount / 4);
+    }
+  }
+
+  return BaseT::getNumberOfParts(Tp);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
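For reference, a minimal standalone sketch (not part of the patch) of the part counts the new getNumberOfParts hook yields for i8 vectors; powerOf2Ceil and partsForI8Vector below are hypothetical stand-ins for llvm::PowerOf2Ceil and the hunk above.

// Illustrative sketch: mirrors the 8-bit arithmetic added above, without LLVM.
#include <cassert>

// Hypothetical stand-in for llvm::PowerOf2Ceil.
static unsigned powerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return N == 0 ? 0 : P;
}

// Assumed behaviour for <N x i8> vectors per the hunk above: four byte lanes
// are packed into each 32-bit part.
static unsigned partsForI8Vector(unsigned NumElts) {
  return powerOf2Ceil(NumElts / 4);
}

int main() {
  assert(partsForI8Vector(4) == 1);  // v4i8  -> one packed 32-bit part
  assert(partsForI8Vector(8) == 2);  // v8i8  -> two parts
  assert(partsForI8Vector(16) == 4); // v16i8 -> four parts
  return 0;
}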
@@ -344,9 +361,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+
+  return (ElemWidth == 8)    ? 4
+         : (ElemWidth == 16) ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
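For a non-load/store opcode, the rewritten ternary chain resolves as sketched below (illustrative only; maxVFSketch is a hypothetical stand-in that takes ST->hasPackedFP32Ops() as a bool parameter).

// Sketch of the new getMaximumVF result for non-memory opcodes.
static unsigned maxVFSketch(unsigned ElemWidth, bool HasPackedFP32Ops) {
  return (ElemWidth == 8)    ? 4
         : (ElemWidth == 16) ? 2
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2
                                                 : 1;
}
// e.g. maxVFSketch(8, false) == 4 and maxVFSketch(16, false) == 2; note the
// old ST->has16BitInsts() guard on the 16-bit case is dropped by this patch.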
@@ -1133,14 +1152,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
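Annotation, not part of the patch: EltsPerReg is the number of mask elements that share one 32-bit register, which drives the per-register costing in the switch cases below.

// Sketch (not from the patch): how EltsPerReg resolves for the two element
// widths accepted by the guard above.
static unsigned eltsPerReg(unsigned ScalarSizeInBits) {
  return 32 / ScalarSizeInBits; // 16-bit -> 2 lanes per VGPR, 8-bit -> 4
}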
@@ -1149,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
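A worked costing example for this case (my own arithmetic, not from the patch), assuming every result lane is requested:

// SK_PermuteSingleSrc, all lanes of the result requested:
//   <8 x i16>: EltsPerReg = 2, NumPerms = alignTo(8, 2) / 2 = 4,
//              NumPermMasks = 4, cost = 8  (matches the old hard-coded "/ 2")
//   <8 x i8> : EltsPerReg = 4, NumPerms = alignTo(8, 4) / 4 = 2,
//              NumPermMasks = 2, cost = 4  (four bytes packed per v_perm)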
@@ -1163,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
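And a matching worked example for the SK_Select case (again my own arithmetic, not from the patch):

// SK_Select over <4 x i8> with all four lanes requested:
//   EltsPerReg   = 4
//   NumPerms     = alignTo(4, 4) / 4 = 1
//   NumPermMasks = 1                  // SK_Select reuses the same mask
//   cost         = 1 + 1 = 2
// Before this patch, 8-bit shuffles never took this fast path and fell
// through to the generic costing below instead.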