@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344
344
unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345
345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346
346
return 32 * 4 / ElemWidth;
347
- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348
- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349
- : 1 ;
347
+ // For a given width return the max 0number of elements that can be combined
348
+ // into a wider bit value:
349
+ return (ElemWidth == 8 && ST->has16BitInsts ()) ? 4
350
+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
351
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
352
+ : 1 ;
350
353
}
351
354
352
355
unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1195,14 +1198,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1195
1198
1196
1199
Kind = improveShuffleKindFromMask (Kind, Mask, SrcTy, Index, SubTp);
1197
1200
1198
- // Larger vector widths may require additional instructions, but are
1199
- // typically cheaper than scalarized versions.
1200
- unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
1201
+ unsigned ScalarSize = DL.getTypeSizeInBits (SrcTy->getElementType ());
1201
1202
if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1202
- DL.getTypeSizeInBits (SrcTy->getElementType ()) == 16 ) {
1203
- bool HasVOP3P = ST->hasVOP3PInsts ();
1203
+ (ScalarSize == 16 || ScalarSize == 8 )) {
1204
+ // Larger vector widths may require additional instructions, but are
1205
+ // typically cheaper than scalarized versions.
1206
+ unsigned NumVectorElts = cast<FixedVectorType>(SrcTy)->getNumElements ();
1204
1207
unsigned RequestedElts =
1205
1208
count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1209
+ unsigned EltsPerReg = 32 / ScalarSize;
1206
1210
if (RequestedElts == 0 )
1207
1211
return 0 ;
1208
1212
switch (Kind) {
@@ -1211,9 +1215,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1211
1215
case TTI::SK_PermuteSingleSrc: {
1212
1216
// With op_sel VOP3P instructions freely can access the low half or high
1213
1217
// half of a register, so any swizzle of two elements is free.
1214
- if (HasVOP3P && NumVectorElts == 2 )
1218
+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
1215
1219
return 0 ;
1216
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1220
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1217
1221
// SK_Broadcast just reuses the same mask
1218
1222
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1219
1223
return NumPerms + NumPermMasks;
@@ -1225,12 +1229,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1225
1229
return 0 ;
1226
1230
// Insert/extract subvectors only require shifts / extract code to get the
1227
1231
// relevant bits
1228
- return alignTo (RequestedElts, 2 ) / 2 ;
1232
+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1229
1233
}
1230
1234
case TTI::SK_PermuteTwoSrc:
1231
1235
case TTI::SK_Splice:
1232
1236
case TTI::SK_Select: {
1233
- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1237
+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
1234
1238
// SK_Select just reuses the same mask
1235
1239
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1236
1240
return NumPerms + NumPermMasks;
@@ -1505,3 +1509,30 @@ GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1505
1509
return AMDGPU::isShader (F->getCallingConv ()) ? KnownIEEEMode::Off
1506
1510
: KnownIEEEMode::On;
1507
1511
}
1512
+
1513
+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1514
+ Align Alignment,
1515
+ unsigned AddressSpace,
1516
+ TTI::TargetCostKind CostKind,
1517
+ TTI::OperandValueInfo OpInfo,
1518
+ const Instruction *I) const {
1519
+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1520
+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1521
+ VecTy->getElementType ()->isIntegerTy (8 )) {
1522
+ return divideCeil (DL.getTypeSizeInBits (VecTy) - 1 ,
1523
+ getLoadStoreVecRegBitWidth (AddressSpace));
1524
+ }
1525
+ }
1526
+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1527
+ OpInfo, I);
1528
+ }
1529
+
1530
+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) const {
1531
+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1532
+ if (VecTy->getElementType ()->isIntegerTy (8 )) {
1533
+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1534
+ return divideCeil (ElementCount - 1 , 4 );
1535
+ }
1536
+ }
1537
+ return BaseT::getNumberOfParts (Tp);
1538
+ }
0 commit comments