@@ -7089,6 +7089,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7089
7089
}))) &&
7090
7090
!all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
7091
7091
!isSplat(Gathers)) {
7092
+ InstructionCost BaseCost = R.getGatherCost(Gathers, !Root);
7092
7093
SetVector<Value *> VectorizedLoads;
7093
7094
SmallVector<unsigned> VectorizedStarts;
7094
7095
SmallVector<unsigned> ScatterVectorized;
@@ -7170,14 +7171,46 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7170
7171
TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
7171
7172
LI->getPointerAddressSpace(), CostKind,
7172
7173
TTI::OperandValueInfo(), LI);
7174
+ // Estimate GEP cost.
7175
+ SmallVector<Value *> PointerOps(VF);
7176
+ for (auto [I, V] : enumerate(VL.slice(P, VF)))
7177
+ PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7178
+ auto [ScalarGEPCost, VectorGEPCost] =
7179
+ getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
7180
+ Instruction::Load, CostKind, LI->getType(), LoadTy);
7181
+ GatherCost += VectorGEPCost - ScalarGEPCost;
7173
7182
}
7174
7183
for (unsigned P : ScatterVectorized) {
7175
7184
auto *LI0 = cast<LoadInst>(VL[P]);
7176
- Align CommonAlignment =
7177
- computeCommonAlignment<LoadInst>(VL.slice(P, VF) );
7185
+ ArrayRef<Value *> Slice = VL.slice(P, VF);
7186
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice );
7178
7187
GatherCost += TTI.getGatherScatterOpCost(
7179
7188
Instruction::Load, LoadTy, LI0->getPointerOperand(),
7180
7189
/*VariableMask=*/false, CommonAlignment, CostKind, LI0);
7190
+ // Estimate GEP cost.
7191
+ SmallVector<Value *> PointerOps(VF);
7192
+ for (auto [I, V] : enumerate(Slice))
7193
+ PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
7194
+ OrdersType Order;
7195
+ if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
7196
+ Order)) {
7197
+ // TODO: improve checks if GEPs can be vectorized.
7198
+ Value *Ptr0 = PointerOps.front();
7199
+ Type *ScalarTy = Ptr0->getType();
7200
+ auto *VecTy = FixedVectorType::get(ScalarTy, VF);
7201
+ auto [ScalarGEPCost, VectorGEPCost] =
7202
+ getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
7203
+ CostKind, ScalarTy, VecTy);
7204
+ GatherCost += VectorGEPCost - ScalarGEPCost;
7205
+ if (!Order.empty()) {
7206
+ SmallVector<int> Mask;
7207
+ inversePermutation(Order, Mask);
7208
+ GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
7209
+ VecTy, Mask, CostKind);
7210
+ }
7211
+ } else {
7212
+ GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true);
7213
+ }
7181
7214
}
7182
7215
if (NeedInsertSubvectorAnalysis) {
7183
7216
// Add the cost for the subvectors insert.
@@ -7187,6 +7220,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7187
7220
}
7188
7221
GatherCost -= ScalarsCost;
7189
7222
}
7223
+ GatherCost = std::min(BaseCost, GatherCost);
7190
7224
} else if (!Root && isSplat(VL)) {
7191
7225
// Found the broadcasting of the single scalar, calculate the cost as
7192
7226
// the broadcast.
0 commit comments