@@ -344,9 +344,13 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  return ElemWidth == 8 && isGFX8Plus                  ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
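Note: the upshot of this hunk is that SLP vectorization can now combine four i8 elements on gfx8+ (VOLCANIC_ISLANDS and later) instead of leaving them scalar. A minimal standalone sketch, assuming plain bools in place of the real `GCNSubtarget` queries; `maxVF` is a hypothetical helper that mirrors the arithmetic above, not the LLVM API:

```cpp
#include <cstdio>

// Hypothetical standalone helper mirroring the new getMaximumVF logic; the
// bool parameters stand in for the real GCNSubtarget queries.
static unsigned maxVF(unsigned ElemWidth, bool IsLoadOrStore, bool IsGFX8Plus,
                      bool Has16BitInsts, bool HasPackedFP32Ops) {
  if (IsLoadOrStore)
    return 32 * 4 / ElemWidth; // a 128-bit memory op split by element width
  return ElemWidth == 8 && IsGFX8Plus            ? 4  // four i8 per 32-bit VGPR
         : (ElemWidth == 16 && Has16BitInsts)    ? 2  // packed 16-bit pair
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2  // packed f32 pair
                                                 : 1; // no packing
}

int main() {
  // On gfx8+, eight-bit elements can now be combined four at a time.
  std::printf("i8  arithmetic VF: %u\n", maxVF(8, false, true, true, false));  // 4
  std::printf("i16 arithmetic VF: %u\n", maxVF(16, false, true, true, false)); // 2
  std::printf("i8  load/store VF: %u\n", maxVF(8, true, true, true, false));   // 16
}
```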
@@ -1151,14 +1155,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(VT->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1167,9 +1172,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
@@ -1181,12 +1186,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
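Note: the recurring `alignTo(RequestedElts, EltsPerReg) / EltsPerReg` now charges one perm-style operation per 32-bit register touched, so i8 shuffles are costed per packed register just like i16 ones. A minimal standalone sketch of that arithmetic; `permuteCost` is a hypothetical helper, not the LLVM API:

```cpp
#include <cstdio>

// Rounds V up to the next multiple of A, like llvm::alignTo.
static unsigned alignTo(unsigned V, unsigned A) { return (V + A - 1) / A * A; }

// Hypothetical helper mirroring the updated shuffle-cost arithmetic: one
// perm-style op per 32-bit register touched, plus one mask constant per perm
// unless the shuffle kind reuses a single mask (SK_Broadcast / SK_Select).
static unsigned permuteCost(unsigned RequestedElts, unsigned ScalarSize,
                            bool ReusesMask) {
  unsigned EltsPerReg = 32 / ScalarSize; // 2 for i16, 4 for i8
  unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
  unsigned NumPermMasks = ReusesMask ? 1 : NumPerms;
  return NumPerms + NumPermMasks;
}

int main() {
  // <8 x i8> single-source permute, all lanes live:
  // EltsPerReg = 4, NumPerms = 2, two masks -> cost 4.
  std::printf("%u\n", permuteCost(8, 8, /*ReusesMask=*/false)); // 4
  // The same lanes as a broadcast reuse one mask -> cost 3.
  std::printf("%u\n", permuteCost(8, 8, /*ReusesMask=*/true));  // 3
}
```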
@@ -1443,3 +1448,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return ((DL.getTypeSizeInBits(VecTy) - 1) /
+              getLoadStoreVecRegBitWidth(AddressSpace)) +
+             1;
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return ((ElementCount - 1) / 4) + 1;
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
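Note: both new overrides reduce to ceiling divisions. A minimal standalone sketch, assuming a 128-bit value for `getLoadStoreVecRegBitWidth` (it varies by address space in the real implementation); `memOpCost` and `numParts` are hypothetical helpers mirroring the arithmetic, not the LLVM API:

```cpp
#include <cstdio>

// Hypothetical helpers mirroring the two new overrides. Both are ceiling
// divisions: the memory cost counts full vector registers spanned, and the
// part count packs four i8 lanes into each 32-bit part.
static unsigned memOpCost(unsigned VecBits, unsigned VecRegBitWidth) {
  return (VecBits - 1) / VecRegBitWidth + 1; // ceil(VecBits / VecRegBitWidth)
}

static unsigned numParts(unsigned NumI8Elts) {
  return (NumI8Elts - 1) / 4 + 1; // ceil(NumI8Elts / 4)
}

int main() {
  // A <16 x i8> load against an assumed 128-bit load/store register: one op.
  std::printf("cost:  %u\n", memOpCost(16 * 8, 128)); // 1
  // A <20 x i8> vector splits into five 32-bit parts.
  std::printf("parts: %u\n", numParts(20));           // 5
}
```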