@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344
344
// Returns the maximum vectorization factor for elements of ElemWidth bits
// used by an instruction with the given Opcode.
//  - Load/Store: 32 * 4 / ElemWidth.
//  - Otherwise: 4 for 8-bit elements and 2 for 16-bit elements when the
//    subtarget reports has16BitInsts(), 2 for 32-bit elements when it reports
//    hasPackedFP32Ops(), and 1 in every other case.
unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345
345
// Memory operations: as many elements as fit in 32 * 4 bits.
// NOTE(review): presumably a 128-bit (4 x 32-bit) access; confirm. ElemWidth
// is used as a divisor here and is not checked for zero.
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346
346
return 32 * 4 / ElemWidth;
347
- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348
- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349
- : 1 ;
347
+ // For a given width return the max number of elements that can be combined
348
+ // into a wider bit value:
349
+ return (ElemWidth == 8 && ST->has16BitInsts ()) ? 4
350
+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
351
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
352
+ : 1 ;
350
353
}
351
354
352
355
unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1151,14 +1154,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1151
1154
1152
1155
Kind = improveShuffleKindFromMask (Kind, Mask, VT, Index, SubTp);
1153
1156
1154
- // Larger vector widths may require additional instructions, but are
1155
- // typically cheaper than scalarized versions.
1156
- unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1157
+ unsigned ScalarSize = DL.getTypeSizeInBits (VT->getElementType ());
1157
1158
if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1158
- DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
1159
- bool HasVOP3P = ST->hasVOP3PInsts ();
1159
+ (ScalarSize == 16 || ScalarSize == 8 )) {
1160
+ // Larger vector widths may require additional instructions, but are
1161
+ // typically cheaper than scalarized versions.
1162
+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1160
1163
unsigned RequestedElts =
1161
1164
count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1165
+ unsigned EltsPerReg = 32 / ScalarSize;
1162
1166
if (RequestedElts == 0 )
1163
1167
return 0 ;
1164
1168
switch (Kind) {
@@ -1167,9 +1171,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1167
1171
case TTI::SK_PermuteSingleSrc: {
1168
1172
// With op_sel VOP3P instructions freely can access the low half or high
1169
1173
// half of a register, so any swizzle of two elements is free.
1170
- if (HasVOP3P && NumVectorElts == 2 )
1174
+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
1171
1175
return 0 ;
1172
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1176
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1173
1177
// SK_Broadcast just reuses the same mask
1174
1178
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1175
1179
return NumPerms + NumPermMasks;
@@ -1181,12 +1185,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1181
1185
return 0 ;
1182
1186
// Insert/extract subvectors only require shifts / extract code to get the
1183
1187
// relevant bits
1184
- return alignTo (RequestedElts, 2 ) / 2 ;
1188
+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1185
1189
}
1186
1190
case TTI::SK_PermuteTwoSrc:
1187
1191
case TTI::SK_Splice:
1188
1192
case TTI::SK_Select: {
1189
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1193
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1190
1194
// SK_Select just reuses the same mask
1191
1195
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1192
1196
return NumPerms + NumPermMasks;
@@ -1443,3 +1447,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
1443
1447
LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
1444
1448
LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
1445
1449
}
1450
+
1451
// Cost of a memory operation on type Src.
// For a load or store of a vector whose element type is an 8-bit integer, the
// cost is the vector's size in bits divided by
// getLoadStoreVecRegBitWidth(AddressSpace), rounded up. Every other
// opcode/type combination is forwarded unchanged to BaseT::getMemoryOpCost.
+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1452
+ Align Alignment,
1453
+ unsigned AddressSpace,
1454
+ TTI::TargetCostKind CostKind,
1455
+ TTI::OperandValueInfo OpInfo,
1456
+ const Instruction *I) const {
1457
+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1458
+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1459
+ VecTy->getElementType ()->isIntegerTy (8 )) {
1460
// (Size - 1) / Width + 1 is a ceiling division of the vector size by the
// load/store register width. NOTE(review): assumes the size is non-zero.
+ return ((DL.getTypeSizeInBits (VecTy) - 1 ) /
1461
+ getLoadStoreVecRegBitWidth (AddressSpace)) +
1462
+ 1 ;
1463
+ }
1464
+ }
1465
// Not an i8-vector load/store: use the base implementation's cost.
+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1466
+ OpInfo, I);
1467
+ }
1468
+
1469
// Number of parts for type Tp.
// For a vector whose element type is an 8-bit integer, this is the element
// count divided by 4, rounded up. Every other type is forwarded to
// BaseT::getNumberOfParts.
+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) const {
1470
+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1471
+ if (VecTy->getElementType ()->isIntegerTy (8 )) {
1472
// NOTE(review): getFixedValue() presumably requires a fixed (non-scalable)
// element count; confirm scalable vectors cannot reach this path.
+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1473
// Ceiling division by 4. NOTE(review): 4 presumably reflects four 8-bit
// elements per 32-bit register; confirm.
+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1474
+ }
1475
+ }
1476
+ return BaseT::getNumberOfParts (Tp);
1477
+ }
0 commit comments