@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344
344
unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345
345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346
346
return 32 * 4 / ElemWidth;
347
- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348
- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349
- : 1 ;
347
+ // For a given width return the max 0number of elements that can be combined
348
+ // into a wider bit value:
349
+ return ElemWidth == 8 ? 4
350
+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
351
+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
352
+ : 1 ;
350
353
}
351
354
352
355
unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1422,3 +1425,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
1422
1425
LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
1423
1426
LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
1424
1427
}
1428
+
1429
+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1430
+ Align Alignment,
1431
+ unsigned AddressSpace,
1432
+ TTI::TargetCostKind CostKind,
1433
+ TTI::OperandValueInfo OpInfo,
1434
+ const Instruction *I) const {
1435
+ if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1436
+ if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1437
+ VecTy->getElementType ()->isIntegerTy (8 )) {
1438
+ return ((DL.getTypeSizeInBits (VecTy) - 1 ) /
1439
+ getLoadStoreVecRegBitWidth (AddressSpace)) +
1440
+ 1 ;
1441
+ }
1442
+ }
1443
+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1444
+ OpInfo, I);
1445
+ }
1446
+
1447
+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) const {
1448
+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1449
+ if (VecTy->getElementType ()->isIntegerTy (8 )) {
1450
+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1451
+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1452
+ }
1453
+ }
1454
+ return BaseT::getNumberOfParts (Tp);
1455
+ }
0 commit comments