@@ -4326,6 +4326,11 @@ calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
   return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
 }
 
+static std::pair<InstructionCost, InstructionCost>
+getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
+            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
+            Type *ScalarTy, VectorType *VecTy);
+
 BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
     SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
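This first hunk only adds a forward declaration: canVectorizeLoads sits near line 4330, while the getGEPCosts definition (modified in the last hunk) lives near line 7981, so the declaration must appear first for the calls below to compile. A minimal standalone sketch of the same pattern, with toy names and costs standing in for the LLVM types:

#include <utility>

// Declaration first, as in the hunk above; the definition may live
// thousands of lines later in the same file.
static std::pair<int, int> getToyGEPCosts(int NumPtrs);

static int netGEPAdjustment(int NumPtrs) {
  // Structured bindings unpack the (scalar, vector) pair, exactly how
  // the next hunk consumes getGEPCosts.
  auto [ScalarCost, VectorCost] = getToyGEPCosts(NumPtrs);
  return VectorCost - ScalarCost;
}

// Toy definition: each scalar pointer costs 1; one vector GEP costs 2.
static std::pair<int, int> getToyGEPCosts(int NumPtrs) {
  return {NumPtrs, 2};
}

int main() { return netGEPAdjustment(8) == 2 - 8 ? 0 : 1; }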
@@ -4464,31 +4469,56 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
     if (VectorizedCnt == VL.size() / VF) {
       // Compare masked gather cost and loads + insersubvector costs.
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-      InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
-          Instruction::Load, VecTy,
-          cast<LoadInst>(VL0)->getPointerOperand(),
-          /*VariableMask=*/false, CommonAlignment, CostKind);
+      auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+          TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
+          CostKind, ScalarTy, VecTy);
+      InstructionCost MaskedGatherCost =
+          TTI.getGatherScatterOpCost(
+              Instruction::Load, VecTy,
+              cast<LoadInst>(VL0)->getPointerOperand(),
+              /*VariableMask=*/false, CommonAlignment, CostKind) +
+          VectorGEPCost - ScalarGEPCost;
       InstructionCost VecLdCost = 0;
       auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
       for (auto [I, LS] : enumerate(States)) {
         auto *LI0 = cast<LoadInst>(VL[I * VF]);
         switch (LS) {
-        case LoadsState::Vectorize:
+        case LoadsState::Vectorize: {
+          auto [ScalarGEPCost, VectorGEPCost] =
+              getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                          LI0->getPointerOperand(), Instruction::Load,
+                          CostKind, ScalarTy, SubVecTy);
           VecLdCost += TTI.getMemoryOpCost(
-              Instruction::Load, SubVecTy, LI0->getAlign(),
-              LI0->getPointerAddressSpace(), CostKind,
-              TTI::OperandValueInfo());
+                           Instruction::Load, SubVecTy, LI0->getAlign(),
+                           LI0->getPointerAddressSpace(), CostKind,
+                           TTI::OperandValueInfo()) +
+                       VectorGEPCost - ScalarGEPCost;
           break;
-        case LoadsState::StridedVectorize:
-          VecLdCost += TTI.getStridedMemoryOpCost(
-              Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-              /*VariableMask=*/false, CommonAlignment, CostKind);
+        }
+        case LoadsState::StridedVectorize: {
+          auto [ScalarGEPCost, VectorGEPCost] =
+              getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+                          LI0->getPointerOperand(), Instruction::Load,
+                          CostKind, ScalarTy, SubVecTy);
+          VecLdCost +=
+              TTI.getStridedMemoryOpCost(
+                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind) +
+              VectorGEPCost - ScalarGEPCost;
           break;
-        case LoadsState::ScatterVectorize:
-          VecLdCost += TTI.getGatherScatterOpCost(
-              Instruction::Load, SubVecTy, LI0->getPointerOperand(),
-              /*VariableMask=*/false, CommonAlignment, CostKind);
+        }
+        case LoadsState::ScatterVectorize: {
+          auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
+              TTI, ArrayRef(PointerOps).slice(I * VF, VF),
+              LI0->getPointerOperand(), Instruction::GetElementPtr,
+              CostKind, ScalarTy, SubVecTy);
+          VecLdCost +=
+              TTI.getGatherScatterOpCost(
+                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind) +
+              VectorGEPCost - ScalarGEPCost;
           break;
+        }
         case LoadsState::Gather:
           llvm_unreachable(
               "Expected only consecutive, strided or masked gather loads.");
@@ -4497,13 +4527,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
         for (int Idx : seq<int>(0, VL.size()))
           ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
         VecLdCost +=
-            TTI.getShuffleCost(TTI ::SK_InsertSubvector, VecTy,
-                               ShuffleMask, CostKind, I * VF, SubVecTy);
+            TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
+                               CostKind, I * VF, SubVecTy);
       }
       // If masked gather cost is higher - better to vectorize, so
       // consider it as a gather node. It will be better estimated
       // later.
-      if (MaskedGatherCost > VecLdCost)
+      if (MaskedGatherCost >= VecLdCost)
         return true;
     }
   }
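Both sides of the comparison now carry the net address-computation delta (VectorGEPCost - ScalarGEPCost): the masked gather pays the delta for one vector GEP over all of PointerOps, while each VF-wide group pays its own delta plus an insert-subvector shuffle. The sketch below models that arithmetic with made-up constants in place of the TTI queries; every number is hypothetical, not LLVM's real cost model:

#include <cstdio>
#include <vector>

struct GEPCosts {
  int Scalar; // materialize each lane's pointer individually
  int Vector; // one wide GEP producing all lanes
};

int main() {
  // Hypothetical costs: 8 loads split into two VF=4 groups.
  GEPCosts WholePtrVec{8, 2};                   // pointers feeding the gather
  std::vector<GEPCosts> Groups{{4, 1}, {4, 1}}; // per-group pointer costs
  int GatherOp = 20, SubVecLoad = 4, InsertSubvec = 2;

  // Gather side: operation cost plus its net GEP delta.
  int MaskedGatherCost = GatherOp + WholePtrVec.Vector - WholePtrVec.Scalar;

  // Split side: each group pays its load, its own GEP delta, and the
  // shuffle that inserts the subvector into the full-width result.
  int VecLdCost = 0;
  for (const GEPCosts &G : Groups)
    VecLdCost += SubVecLoad + G.Vector - G.Scalar + InsertSubvec;

  // Mirrors the hunk's final test.
  std::printf("gather=%d split=%d -> %s\n", MaskedGatherCost, VecLdCost,
              MaskedGatherCost >= VecLdCost ? "use split loads" : "gather");
  return 0;
}

Note also the switch from > to >= in the final test: with the new inclusive comparison, a tie no longer favors the masked-gather path.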
@@ -7951,7 +7981,13 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 
   ScalarCost =
       TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
-  if (auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr)) {
+  auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
+  if (!BaseGEP) {
+    auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
+    if (It != Ptrs.end())
+      BaseGEP = cast<GEPOperator>(*It);
+  }
+  if (BaseGEP) {
     SmallVector<const Value *> Indices(BaseGEP->indices());
     VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                              BaseGEP->getPointerOperand(), Indices, VecTy,
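This last hunk changes how getGEPCosts picks the GEP whose indices seed the vector-GEP estimate: previously it gave up when BasePtr itself was not a GEPOperator; now it falls back to the first GEP found anywhere in Ptrs (via find_if with IsaPred). A minimal sketch of that fallback, with a plain struct and std::find_if standing in for llvm::Value, GEPOperator, and llvm::find_if:

#include <algorithm>
#include <vector>

// Stand-in for llvm::Value; IsGEP plays the role of isa<GEPOperator>.
struct Ptr {
  bool IsGEP;
};

// Mirrors the fallback: prefer BasePtr if it is a GEP, otherwise scan
// the pointer list for any GEP to use as the representative. It may
// still come up empty, in which case (as in the hunk) the vector-GEP
// estimate is simply skipped.
const Ptr *pickRepresentativeGEP(const Ptr &BasePtr,
                                 const std::vector<Ptr> &Ptrs) {
  if (BasePtr.IsGEP)
    return &BasePtr;
  auto It = std::find_if(Ptrs.begin(), Ptrs.end(),
                         [](const Ptr &P) { return P.IsGEP; });
  return It == Ptrs.end() ? nullptr : &*It;
}

int main() {
  std::vector<Ptr> Ptrs{{false}, {true}, {true}};
  Ptr Base{false};
  // The base is not a GEP, so the first GEP among the operands wins.
  return pickRepresentativeGEP(Base, Ptrs) == &Ptrs[1] ? 0 : 1;
}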