
Commit fc382db

[SLP]Improve comparison of shuffled loads/masked gathers by adding GEP cost.
In some cases a masked gather is less profitable than insert-subvector of consecutive/strided loads. SLP already has this kind of analysis, but it needs to be improved by adding the cost of the GEPs to the comparison. Also, the GEP cost estimation for masked gathers is fixed.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #90737
1 parent 59ef94d commit fc382db
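
The sketch below is a minimal, self-contained model of the comparison this patch changes in BoUpSLP::canVectorizeLoads. It is illustrative only: the cost numbers are invented and the helper names (SubLoadCosts, preferSplitLoads) are hypothetical; the real code queries TargetTransformInfo for every term and dispatches on the per-chunk LoadsState, as shown in the diff below.

// Illustrative model only -- not LLVM code. Costs are made up; the real
// implementation obtains each term from TargetTransformInfo.
#include <cstdio>
#include <vector>

struct SubLoadCosts {
  int LoadCost;      // consecutive/strided/scattered sub-load
  int VectorGEPCost; // vectorized address computation for this chunk
  int ScalarGEPCost; // scalar GEPs that the vector form replaces
  int ShuffleCost;   // SK_InsertSubvector shuffle into the wide vector
};

// Mirrors the comparison after the patch: both alternatives are charged the
// difference between their vector and scalar GEP costs, and a tie (>=, was >)
// now prefers the split loads, so the group is kept as a gather node and
// re-estimated later.
static bool preferSplitLoads(int GatherCost, int GatherVectorGEPCost,
                             int GatherScalarGEPCost,
                             const std::vector<SubLoadCosts> &Chunks) {
  int MaskedGatherCost = GatherCost + GatherVectorGEPCost - GatherScalarGEPCost;
  int VecLdCost = 0;
  for (const SubLoadCosts &C : Chunks)
    VecLdCost += C.LoadCost + C.VectorGEPCost - C.ScalarGEPCost + C.ShuffleCost;
  return MaskedGatherCost >= VecLdCost;
}

int main() {
  // Two consecutive <2 x i16> sub-loads, as in the RISC-V test updated below:
  // cheap loads, no vector GEPs needed, one insert-subvector shuffle each.
  std::vector<SubLoadCosts> Chunks = {{1, 0, 1, 1}, {1, 0, 1, 1}};
  bool Split = preferSplitLoads(/*GatherCost=*/8, /*GatherVectorGEPCost=*/4,
                                /*GatherScalarGEPCost=*/2, Chunks);
  std::printf("split loads preferred: %s\n", Split ? "yes" : "no");
}

With the old strict '>' an exact tie fell to the masked gather; after the change it goes to the split loads, which is what the updated RISC-V test at the bottom exercises.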

File tree: 2 files changed, +62 -29 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 56 additions & 20 deletions
@@ -4326,6 +4326,11 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
 }
 
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy);
+
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
@@ -4464,31 +4469,56 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
       if (VectorizedCnt == VL.size() / VF) {
         // Compare masked gather cost and loads + insersubvector costs.
         TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-        InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
-            Instruction::Load, VecTy,
-            cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, CommonAlignment, CostKind);
+        auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+            TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
+            CostKind, ScalarTy, VecTy);
+        InstructionCost MaskedGatherCost =
+            TTI.getGatherScatterOpCost(
+                Instruction::Load, VecTy,
+                cast<LoadInst>(VL0)->getPointerOperand(),
+                /*VariableMask=*/false, CommonAlignment, CostKind) +
+            VectorGEPCost - ScalarGEPCost;
         InstructionCost VecLdCost = 0;
         auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
         for (auto [I, LS] : enumerate(States)) {
           auto *LI0 = cast<LoadInst>(VL[I * VF]);
           switch (LS) {
-          case LoadsState::Vectorize:
+          case LoadsState::Vectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] =
+                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                            LI0->getPointerOperand(), Instruction::Load,
+                            CostKind, ScalarTy, SubVecTy);
             VecLdCost += TTI.getMemoryOpCost(
-                Instruction::Load, SubVecTy, LI0->getAlign(),
-                LI0->getPointerAddressSpace(), CostKind,
-                TTI::OperandValueInfo());
+                             Instruction::Load, SubVecTy, LI0->getAlign(),
+                             LI0->getPointerAddressSpace(), CostKind,
+                             TTI::OperandValueInfo()) +
+                         VectorGEPCost - ScalarGEPCost;
             break;
-          case LoadsState::StridedVectorize:
-            VecLdCost += TTI.getStridedMemoryOpCost(
-                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                /*VariableMask=*/false, CommonAlignment, CostKind);
+          }
+          case LoadsState::StridedVectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] =
+                getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                            LI0->getPointerOperand(), Instruction::Load,
+                            CostKind, ScalarTy, SubVecTy);
+            VecLdCost +=
+                TTI.getStridedMemoryOpCost(
+                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                    /*VariableMask=*/false, CommonAlignment, CostKind) +
+                VectorGEPCost - ScalarGEPCost;
             break;
-          case LoadsState::ScatterVectorize:
-            VecLdCost += TTI.getGatherScatterOpCost(
-                Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-                /*VariableMask=*/false, CommonAlignment, CostKind);
+          }
+          case LoadsState::ScatterVectorize: {
+            auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+                TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                LI0->getPointerOperand(), Instruction::GetElementPtr,
+                CostKind, ScalarTy, SubVecTy);
+            VecLdCost +=
+                TTI.getGatherScatterOpCost(
+                    Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                    /*VariableMask=*/false, CommonAlignment, CostKind) +
+                VectorGEPCost - ScalarGEPCost;
             break;
+          }
           case LoadsState::Gather:
             llvm_unreachable(
                 "Expected only consecutive, strided or masked gather loads.");
@@ -4497,13 +4527,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
           for (int Idx : seq<int>(0, VL.size()))
             ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
           VecLdCost +=
-              TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                                 ShuffleMask, CostKind, I * VF, SubVecTy);
+              TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
+                                 CostKind, I * VF, SubVecTy);
         }
         // If masked gather cost is higher - better to vectorize, so
         // consider it as a gather node. It will be better estimated
         // later.
-        if (MaskedGatherCost > VecLdCost)
+        if (MaskedGatherCost >= VecLdCost)
           return true;
       }
     }
@@ -7951,7 +7981,13 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 
   ScalarCost =
       TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
-  if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+  auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
+  if (!BaseGEP) {
+    auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
+    if (It != Ptrs.end())
+      BaseGEP = cast<GEPOperator>(*It);
+  }
+  if (BaseGEP) {
     SmallVector<const Value *> Indices(BaseGEP->indices());
     VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                              BaseGEP->getPointerOperand(), Indices, VecTy,
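
The hunk above handles the case where the common base pointer is not itself a GEP (for example, a plain function argument) while the remaining pointer operands are GEPs; previously no vector GEP cost was charged at all in that situation. Below is a rough standalone model of that fallback, with hypothetical types and invented numbers; the real code works on GEPOperator values via find_if and IsaPred.

// Rough standalone model of the getGEPCosts fallback -- not LLVM code.
#include <cstdio>
#include <vector>

struct PtrDesc {
  bool IsGEP;        // produced by a getelementptr?
  int VectorGEPCost; // cost of its address computation when vectorized
};

static int chargedVectorGEPCost(const PtrDesc &Base,
                                const std::vector<PtrDesc> &Ptrs) {
  if (Base.IsGEP)
    return Base.VectorGEPCost; // previous behavior, unchanged
  for (const PtrDesc &P : Ptrs)
    if (P.IsGEP)
      return P.VectorGEPCost;  // new fallback: first GEP among the operands
  return 0;                    // still nothing to charge
}

int main() {
  PtrDesc Base{/*IsGEP=*/false, 0};  // e.g. the incoming pointer argument
  std::vector<PtrDesc> Ptrs = {{true, 2}, {true, 2}, {true, 2}};
  std::printf("vector GEP cost charged: %d\n",
              chargedVectorGEPCost(Base, Ptrs));
}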

llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll

Lines changed: 6 additions & 9 deletions
@@ -4,16 +4,13 @@
 define void @test(ptr noalias %p, ptr %p1) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[GEP799:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
-; CHECK-NEXT:    [[L3:%.*]] = load i16, ptr [[GEP799]], align 2
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 18
-; CHECK-NEXT:    [[L4:%.*]] = load i16, ptr [[GEP3]], align 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2
-; CHECK-NEXT:    store <2 x i16> [[TMP1]], ptr [[P1]], align 2
-; CHECK-NEXT:    [[GEPS2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
-; CHECK-NEXT:    store i16 [[L3]], ptr [[GEPS2]], align 2
-; CHECK-NEXT:    [[GEPS3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 6
-; CHECK-NEXT:    store i16 [[L4]], ptr [[GEPS3]], align 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[P1]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %l1 = load i16, ptr %p, align 2
