@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344
344
unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345
345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346
346
return 32 * 4 / ElemWidth;
347
- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348
- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349
- : 1 ;
347
+ // For a given width return the max number of elements that can be combined
348
+ // into a wider bit value:
349
+ return ElemWidth == 8 ? 4
350
+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
351
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
352
+ : 1 ;
350
353
}
351
354
352
355
unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1443,3 +1446,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
1443
1446
LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
1444
1447
LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
1445
1448
}
1449
+
1450
+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1451
+ Align Alignment,
1452
+ unsigned AddressSpace,
1453
+ TTI::TargetCostKind CostKind,
1454
+ TTI::OperandValueInfo OpInfo,
1455
+ const Instruction *I) const {
1456
+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1457
+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1458
+ VecTy->getElementType ()->isIntegerTy (8 )) {
1459
+ return ((DL.getTypeSizeInBits (VecTy) - 1 ) /
1460
+ getLoadStoreVecRegBitWidth (AddressSpace)) +
1461
+ 1 ;
1462
+ }
1463
+ }
1464
+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1465
+ OpInfo, I);
1466
+ }
1467
+
1468
+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) const {
1469
+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1470
+ if (VecTy->getElementType ()->isIntegerTy (8 )) {
1471
+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1472
+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1473
+ }
1474
+ }
1475
+ return BaseT::getNumberOfParts (Tp);
1476
+ }
0 commit comments