Skip to content

Commit 32c7371

Browse files
committed
Enable vectorization of i8 values.
1 parent beffd15 commit 32c7371

File tree

5 files changed

+186
-306
lines changed

5 files changed

+186
-306
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// For a given width return the max number of elements that can be combined
348+
// into a wider bit value:
349+
return ElemWidth == 8 ? 4
350+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
351+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
352+
: 1;
350353
}
351354

352355
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1443,3 +1446,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14431446
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14441447
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14451448
}
1449+
1450+
/// Returns the cost of a memory operation on \p Src. Loads and stores of
/// vectors whose element type is i8 are costed as the number of
/// load/store vector registers (of getLoadStoreVecRegBitWidth(AddressSpace)
/// bits each) needed to hold the whole vector; every other case is delegated
/// to BaseT::getMemoryOpCost with the arguments passed through unchanged.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1451+
Align Alignment,
1452+
unsigned AddressSpace,
1453+
TTI::TargetCostKind CostKind,
1454+
TTI::OperandValueInfo OpInfo,
1455+
const Instruction *I) const {
1456+
// Only vector types are candidates for the i8 special case.
if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1457+
// Restrict the special case to plain loads/stores of i8-element vectors.
if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1458+
VecTy->getElementType()->isIntegerTy(8)) {
1459+
// ((Bits - 1) / Width) + 1 is the ceiling of Bits / Width for Bits >= 1,
// i.e. the vector's bit size rounded up to whole register-width units.
return ((DL.getTypeSizeInBits(VecTy) - 1) /
1460+
getLoadStoreVecRegBitWidth(AddressSpace)) +
1461+
1;
1462+
}
1463+
}
1464+
// Not an i8 vector load/store: use the generic cost model.
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1465+
OpInfo, I);
1466+
}
1467+
1468+
/// Returns the number of parts \p Tp is counted as. Vectors whose element
/// type is i8 are counted in groups of four elements (rounded up); every
/// other type is delegated to BaseT::getNumberOfParts.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1469+
if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1470+
if (VecTy->getElementType()->isIntegerTy(8)) {
1471+
// NOTE(review): getFixedValue() presumably requires a fixed-width vector;
// confirm scalable i8 vectors never reach this path.
unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1472+
// Ceiling of ElementCount / 4 (for ElementCount >= 1): four i8 elements
// make up 32 bits, so each group of four counts as one part.
return ((ElementCount - 1) / 4) + 1;
1473+
}
1474+
}
1475+
// Not an i8 vector: use the generic part count.
return BaseT::getNumberOfParts(Tp);
1476+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
281281
void collectKernelLaunchBounds(
282282
const Function &F,
283283
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
284+
285+
/// Account for loads and stores of i8 vector types to have reduced cost. For
286+
/// example, the cost of loading 4 i8 values is the cost of loading
287+
/// a single i32 value.
288+
InstructionCost getMemoryOpCost(
289+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
290+
TTI::TargetCostKind CostKind,
291+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
292+
const Instruction *I = nullptr) const override;
293+
294+
/// When counting parts on AMD GPUs, account for i8s being grouped
295+
/// together under a single i32 value. Otherwise fall back to base
296+
/// implementation.
297+
unsigned getNumberOfParts(Type *Tp) const override;
284298
};
285299

286300
} // end namespace llvm

0 commit comments

Comments
 (0)