@@ -344,9 +344,13 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
-                                                       : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  bool isGFX8Plus = ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  return (ElemWidth == 8 && isGFX8Plus)                ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
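
For intuition: on gfx8+ four i8 elements now combine into one 32-bit value, while the load/store path is unchanged at four 32-bit registers' worth of lanes. Below is a minimal standalone sketch of that selection logic, not part of the patch; the feature booleans stand in for the subtarget queries:

#include <cassert>

// Hypothetical mirror of getMaximumVF above; IsGFX8Plus stands in for
// ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS.
static unsigned maxVFSketch(unsigned ElemWidth, bool IsLoadOrStore,
                            bool IsGFX8Plus, bool Has16Bit,
                            bool HasPackedFP32) {
  if (IsLoadOrStore)
    return 32 * 4 / ElemWidth;                     // four 32-bit registers
  return (ElemWidth == 8 && IsGFX8Plus)       ? 4  // 4 x i8 pack into 32 bits
         : (ElemWidth == 16 && Has16Bit)      ? 2
         : (ElemWidth == 32 && HasPackedFP32) ? 2
                                              : 1;
}

int main() {
  assert(maxVFSketch(8, /*IsLoadOrStore=*/false, true, true, false) == 4);
  assert(maxVFSketch(8, /*IsLoadOrStore=*/true, true, true, false) == 16);
  return 0;
}
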
@@ -1195,14 +1199,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
 
-  // Larger vector widths may require additional instructions, but are
-  // typically cheaper than scalarized versions.
-  unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+  unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      DL.getTypeSizeInBits(SrcTy->getElementType()) == 16) {
-    bool HasVOP3P = ST->hasVOP3PInsts();
+      (ScalarSize == 16 || ScalarSize == 8)) {
+    // Larger vector widths may require additional instructions, but are
+    // typically cheaper than scalarized versions.
+    unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements();
     unsigned RequestedElts =
         count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+    unsigned EltsPerReg = 32 / ScalarSize;
     if (RequestedElts == 0)
       return 0;
     switch (Kind) {
@@ -1211,9 +1216,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     case TTI::SK_PermuteSingleSrc: {
       // With op_sel VOP3P instructions freely can access the low half or high
       // half of a register, so any swizzle of two elements is free.
-      if (HasVOP3P && NumVectorElts == 2)
+      if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumVectorElts == 2)
         return 0;
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Broadcast just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
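
Note that the free-swizzle early-out is now additionally gated on ScalarSize == 16: op_sel on VOP3P instructions selects 16-bit halves of a 32-bit register, which lines up with two 16-bit lanes but cannot express an arbitrary two-element i8 swizzle. A predicate sketch (hypothetical helper, same operands as in the patch):

// Free only when op_sel's half-register granularity matches the lanes.
static bool isFreeSwizzle(bool HasVOP3P, unsigned ScalarSize,
                          unsigned NumVectorElts) {
  return HasVOP3P && ScalarSize == 16 && NumVectorElts == 2;
}
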
@@ -1225,12 +1230,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
         return 0;
       // Insert/extract subvectors only require shifts / extract code to get the
       // relevant bits
-      return alignTo(RequestedElts, 2) / 2;
+      return alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
     }
     case TTI::SK_PermuteTwoSrc:
     case TTI::SK_Splice:
     case TTI::SK_Select: {
-      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+      unsigned NumPerms = alignTo(RequestedElts, EltsPerReg) / EltsPerReg;
       // SK_Select just reuses the same mask
       unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
       return NumPerms + NumPermMasks;
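
Each arm now divides by EltsPerReg rather than a hard-coded 2, so i8 shuffles amortize four lanes per 32-bit register. A self-checking sketch of the arithmetic (alignToSketch mimics llvm::alignTo for these inputs):

#include <cassert>

static unsigned alignToSketch(unsigned V, unsigned A) {
  return (V + A - 1) / A * A; // round V up to a multiple of A
}

static unsigned numPerms(unsigned RequestedElts, unsigned ScalarSize) {
  unsigned EltsPerReg = 32 / ScalarSize;
  return alignToSketch(RequestedElts, EltsPerReg) / EltsPerReg;
}

int main() {
  assert(numPerms(6, 8) == 2);  // i8: six lanes span two 32-bit registers
  assert(numPerms(3, 16) == 2); // i16: same answer as the old 2-per-reg rule
  return 0;
}
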
@@ -1505,3 +1510,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
   return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
                                                : KnownIEEEMode::On;
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
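
Worked numbers for the two new overrides, assuming a 128-bit load/store vector register width for the address space in question (getLoadStoreVecRegBitWidth varies by address space): a <16 x i8> access is 128 bits, costing divideCeil(128 - 1, 128) = 1, and <32 x i8> costs divideCeil(256 - 1, 128) = 2; for register splitting, <8 x i8> gives divideCeil(8 - 1, 4) = 2 parts since four i8 lanes fill one 32-bit part. A self-checking sketch:

#include <cassert>

static unsigned divideCeilSketch(unsigned N, unsigned D) {
  return (N + D - 1) / D; // same rounding as llvm::divideCeil
}

int main() {
  assert(divideCeilSketch(16 * 8 - 1, 128) == 1); // <16 x i8> load/store
  assert(divideCeilSketch(32 * 8 - 1, 128) == 2); // <32 x i8>
  assert(divideCeilSketch(8 - 1, 4) == 2);        // parts for <8 x i8>
  return 0;
}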