
Commit 29cddcf

Lukacma authored and AlexisPerry committed
[AArch64][SVE] optimisation for SVE load intrinsics with no active lanes (llvm#95269)
This patch extends llvm#73964 and adds an optimisation for SVE load intrinsics whose governing predicate has no active lanes.
1 parent 81d59fb commit 29cddcf
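
The effect of the fold, illustrated on a contiguous ld1 (a sketch, not part of the commit; the nxv4i32 mangling and the function name are assumed):

    define <vscale x 4 x i32> @ld1_all_inactive(ptr %p) {
      ; the governing predicate is all-false, so no lane is loaded
      %r = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %p)
      ret <vscale x 4 x i32> %r
    }
    ; after instcombine the load is gone:
    ;   ret <vscale x 4 x i32> zeroinitializer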

File tree

2 files changed: 465 additions & 0 deletions


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 70 additions & 0 deletions
@@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) {
                          m_ConstantInt<AArch64SVEPredPattern::all>()));
 }
 
+// Simplify a unary operation whose predicate has all inactive lanes by
+// replacing the instruction with a zeroed object.
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
+  if (match(II.getOperand(0), m_ZeroInt())) {
+    Constant *Node;
+    Type *RetTy = II.getType();
+    if (RetTy->isStructTy()) {
+      auto StructT = cast<StructType>(RetTy);
+      auto VecT = StructT->getElementType(0);
+      SmallVector<llvm::Constant *, 4> ZerVec;
+      for (unsigned i = 0; i < StructT->getNumElements(); i++) {
+        ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
+                                                  : ConstantInt::get(VecT, 0));
+      }
+      Node = ConstantStruct::get(StructT, ZerVec);
+    } else if (RetTy->isFPOrFPVectorTy())
+      Node = ConstantFP::get(RetTy, 0.0);
+    else
+      Node = ConstantInt::get(II.getType(), 0);
+
+    IC.replaceInstUsesWith(II, Node);
+    return IC.eraseInstFromFunction(II);
+  }
+  return std::nullopt;
+}
+
 static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                       IntrinsicInst &II) {
   // svsel(ptrue, x, y) => x
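
The isStructTy branch above covers the multi-vector ldN.sret intrinsics, whose result is a struct of scalable vectors; the helper zeroes each element and wraps them in a ConstantStruct. A sketch of the resulting fold (ld2 shown; intrinsic mangling assumed):

    %s = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %p)
    ; => { <vscale x 4 x i32>, <vscale x 4 x i32> } zeroinitializer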
@@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
   Value *PtrOp = II.getOperand(1);
   Type *VecTy = II.getType();
 
+  // Replace by zero constant when all lanes are inactive
+  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+    return II_NA;
+
   if (isAllActivePredicate(Pred)) {
     LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
     Load->copyMetadata(II);
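
With this hunk, instCombineSVELD1 handles both predicate extremes; roughly (mangling assumed; ptrue pattern 31 is the "all" pattern):

    ; all-false predicate (new): folds to zero, the load disappears
    %a = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %p)
    ; => <vscale x 4 x i32> zeroinitializer

    ; all-true predicate (pre-existing fold): becomes a plain unmasked load
    %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
    %b  = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pg, ptr %p)
    ; => %b = load <vscale x 4 x i32>, ptr %p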
@@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
   Type *Ty = II.getType();
   Value *PassThru = ConstantAggregateZero::get(Ty);
 
+  // Replace by zero constant when all lanes are inactive
+  if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+    return II_NA;
+
   // Contiguous gather => masked load.
   // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
   // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
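
The same early-out now guards the gather path before the contiguous-gather rewrite sketched in the comment above; for example (index form shown, mangling assumed):

    %g = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(
                  <vscale x 2 x i1> zeroinitializer, ptr %base, <vscale x 2 x i64> %idx)
    ; => <vscale x 2 x i64> zeroinitializer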
@@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   switch (IID) {
   default:
     break;
+
+  case Intrinsic::aarch64_sve_ld1_gather:
+  case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+  case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+  case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+  case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+  case Intrinsic::aarch64_sve_ld1q_gather_index:
+  case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
+  case Intrinsic::aarch64_sve_ld1ro:
+  case Intrinsic::aarch64_sve_ld1rq:
+  case Intrinsic::aarch64_sve_ld1udq:
+  case Intrinsic::aarch64_sve_ld1uwq:
+  case Intrinsic::aarch64_sve_ld2_sret:
+  case Intrinsic::aarch64_sve_ld2q_sret:
+  case Intrinsic::aarch64_sve_ld3_sret:
+  case Intrinsic::aarch64_sve_ld3q_sret:
+  case Intrinsic::aarch64_sve_ld4_sret:
+  case Intrinsic::aarch64_sve_ld4q_sret:
+  case Intrinsic::aarch64_sve_ldff1:
+  case Intrinsic::aarch64_sve_ldff1_gather:
+  case Intrinsic::aarch64_sve_ldff1_gather_index:
+  case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+  case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+  case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+  case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+  case Intrinsic::aarch64_sve_ldnf1:
+  case Intrinsic::aarch64_sve_ldnt1:
+  case Intrinsic::aarch64_sve_ldnt1_gather:
+  case Intrinsic::aarch64_sve_ldnt1_gather_index:
+  case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+  case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+    return instCombineSVENoActiveUnaryZero(IC, II);
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
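
Every intrinsic listed above routes to the same helper. A reduced check in the spirit of the commit's test file (not shown here; the function name, attributes, and invocation are assumed), runnable as `opt -S -passes=instcombine -mtriple=aarch64`:

    define <vscale x 4 x i32> @ldnt1_no_active_lanes(ptr %p) #0 {
      %r = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %p)
      ret <vscale x 4 x i32> %r  ; expected to become: ret <vscale x 4 x i32> zeroinitializer
    }
    attributes #0 = { "target-features"="+sve" }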
