Skip to content

Commit 703a497

Browse files
committed
Enable vectorization of i8 values.
1 parent 7aed77e commit 703a497

File tree

5 files changed

+186
-306
lines changed

5 files changed

+186
-306
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// For a given width return the max number of elements that can be combined
348+
// into a wider bit value:
349+
return ElemWidth == 8 ? 4
350+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
351+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
352+
: 1;
350353
}
351354

352355
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1422,3 +1425,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14221425
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14231426
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14241427
}
1428+
1429+
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1430+
Align Alignment,
1431+
unsigned AddressSpace,
1432+
TTI::TargetCostKind CostKind,
1433+
TTI::OperandValueInfo OpInfo,
1434+
const Instruction *I) const {
1435+
if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1436+
if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1437+
VecTy->getElementType()->isIntegerTy(8)) {
1438+
return ((DL.getTypeSizeInBits(VecTy) - 1) /
1439+
getLoadStoreVecRegBitWidth(AddressSpace)) +
1440+
1;
1441+
}
1442+
}
1443+
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1444+
OpInfo, I);
1445+
}
1446+
1447+
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1448+
if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1449+
if (VecTy->getElementType()->isIntegerTy(8)) {
1450+
unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1451+
return ((ElementCount - 1) / 4) + 1;
1452+
}
1453+
}
1454+
return BaseT::getNumberOfParts(Tp);
1455+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
278278
void collectKernelLaunchBounds(
279279
const Function &F,
280280
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
281+
282+
/// Account for loads of i8 vector types to have reduced cost. For
283+
/// example, the cost of loading 4 i8 values equals the cost of loading
284+
/// a single i32 value.
285+
InstructionCost getMemoryOpCost(
286+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
287+
TTI::TargetCostKind CostKind,
288+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
289+
const Instruction *I = nullptr) const override;
290+
291+
/// When counting parts on AMD GPUs, account for i8s being grouped
292+
/// together under a single i32 value. Otherwise fall back to base
293+
/// implementation.
294+
unsigned getNumberOfParts(Type *Tp) const override;
281295
};
282296

283297
} // end namespace llvm

0 commit comments

Comments
 (0)