@@ -313,6 +313,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  // For certain 8 bit ops, we can pack a v4i8 into a single part
+  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
+  // do not limit the numberOfParts for 8 bit vectors to the
+  // legalization costs of such. It is left up to other target
+  // queries (e.g. get*InstrCost) to decide the proper handling
+  // of 8 bit vectors.
+  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
+    if (DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
+      unsigned ElCount = VTy->getElementCount().getFixedValue();
+      return PowerOf2Ceil(ElCount / 4);
+    }
+  }
+
+  return BaseT::getNumberOfParts(Tp);
+}
+
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
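
For reference, a minimal standalone sketch of the arithmetic in the new getNumberOfParts override, under the assumption that four i8 lanes pack into one 32-bit part; powerOf2Ceil and numParts8Bit below are illustrative stand-ins, not code from the patch:

    #include <cstdint>

    // Minimal stand-in for llvm::PowerOf2Ceil: round A up to a power of two
    // (returns 0 for an input of 0, matching the LLVM helper).
    static uint64_t powerOf2Ceil(uint64_t A) {
      if (A == 0)
        return 0;
      uint64_t P = 1;
      while (P < A)
        P <<= 1;
      return P;
    }

    // Mirror of the 8-bit path above: four i8 elements share one 32-bit part.
    static unsigned numParts8Bit(unsigned ElCount) {
      return powerOf2Ceil(ElCount / 4);
    }

    // numParts8Bit(4)  == 1   (a v4i8 fits in a single part, e.g. one v_perm)
    // numParts8Bit(8)  == 2   (v8i8 -> two 32-bit parts)
    // numParts8Bit(16) == 4   (v16i8 -> four 32-bit parts)
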
@@ -344,9 +361,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+
+  return (ElemWidth == 8)                              ? 4
+         : (ElemWidth == 16)                           ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
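
As a worked check on the formula above, here is a hedged mirror of getMaximumVF (illustrative only, not the patch's code; hasPackedFP32 stands in for ST->hasPackedFP32Ops()):

    // Loads/stores use 32 * 4 / ElemWidth: a full 128-bit access gives
    // 16 x i8, 8 x i16, or 4 x i32 lanes. Other opcodes use the table below.
    static unsigned maxVF(unsigned ElemWidth, bool IsMemOp, bool hasPackedFP32) {
      if (IsMemOp)
        return 32 * 4 / ElemWidth;
      if (ElemWidth == 8)
        return 4; // v4i8 packs into one 32-bit VGPR
      if (ElemWidth == 16)
        return 2; // packed 16-bit ops
      if (ElemWidth == 32 && hasPackedFP32)
        return 2; // packed FP32 ops
      return 1;
    }
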
@@ -1154,14 +1173,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1170,9 +1190,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1184,12 +1204,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
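
To tie the shuffle-cost hunks together, a small sketch of the new permute-count arithmetic with a worked example; numPerms is a hypothetical helper that spells out alignTo(RequestedElts, EltsPerReg) / EltsPerReg:

    #include <cassert>

    // A 32-bit VGPR holds 32 / ScalarSize elements, and each v_perm covers
    // one register's worth of requested mask elements.
    static unsigned numPerms(unsigned RequestedElts, unsigned ScalarSize) {
      unsigned EltsPerReg = 32 / ScalarSize;
      return (RequestedElts + EltsPerReg - 1) / EltsPerReg; // alignTo, spelled out
    }

    int main() {
      // v8i16, all 8 lanes requested: 2 elts per register -> 4 perms.
      assert(numPerms(8, 16) == 4);
      // v8i8, all 8 lanes requested: 4 elts per register -> 2 perms, so an
      // SK_PermuteSingleSrc costs 2 perms + 2 perm masks = 4.
      assert(numPerms(8, 8) == 2);
      return 0;
    }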