@@ -4000,12 +4000,14 @@ static bool isReverseOrder(ArrayRef<unsigned> Order) {
 
 /// Checks if the given array of loads can be represented as a vectorized,
 /// scatter or just simple gather.
-static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
+static LoadsState canVectorizeLoads(const BoUpSLP &R, ArrayRef<Value *> VL,
+                                    const Value *VL0,
                                     const TargetTransformInfo &TTI,
                                     const DataLayout &DL, ScalarEvolution &SE,
                                     LoopInfo &LI, const TargetLibraryInfo &TLI,
                                     SmallVectorImpl<unsigned> &Order,
-                                    SmallVectorImpl<Value *> &PointerOps) {
+                                    SmallVectorImpl<Value *> &PointerOps,
+                                    bool TryRecursiveCheck = true) {
   // Check that a vectorized load would load the same memory as a scalar
   // load. For example, we don't want to vectorize loads that are smaller
   // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
@@ -4098,6 +4100,78 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
       }
     }
+    auto CheckForShuffledLoads = [&](Align CommonAlignment) {
+      unsigned Sz = DL.getTypeSizeInBits(ScalarTy);
+      unsigned MinVF = R.getMinVF(Sz);
+      unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
+      MaxVF = std::min(R.getMaximumVF(Sz, Instruction::Load), MaxVF);
+      for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
+        unsigned VectorizedCnt = 0;
+        SmallVector<LoadsState> States;
+        for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
+             Cnt += VF, ++VectorizedCnt) {
+          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+          SmallVector<unsigned> Order;
+          SmallVector<Value *> PointerOps;
+          LoadsState LS =
+              canVectorizeLoads(R, Slice, Slice.front(), TTI, DL, SE, LI, TLI,
+                                Order, PointerOps, /*TryRecursiveCheck=*/false);
+          // Check that the sorted loads are consecutive.
+          if (LS == LoadsState::Gather)
+            break;
+          // If a reorder is needed, treat it as a high-cost masked gather.
+          if ((LS == LoadsState::Vectorize ||
+               LS == LoadsState::StridedVectorize) &&
+              !Order.empty() && !isReverseOrder(Order))
+            LS = LoadsState::ScatterVectorize;
+          States.push_back(LS);
+        }
+        // Can be vectorized later as a series of loads/insertelements.
+        if (VectorizedCnt == VL.size() / VF) {
+          // Compare masked gather cost and loads + insertsubvector costs.
+          TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+          InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
+              Instruction::Load, VecTy,
+              cast<LoadInst>(VL0)->getPointerOperand(),
+              /*VariableMask=*/false, CommonAlignment, CostKind);
+          InstructionCost VecLdCost = 0;
+          auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
+          for (auto [I, LS] : enumerate(States)) {
+            auto *LI0 = cast<LoadInst>(VL[I * VF]);
+            switch (LS) {
+            case LoadsState::Vectorize:
+              VecLdCost += TTI.getMemoryOpCost(
+                  Instruction::Load, SubVecTy, LI0->getAlign(),
+                  LI0->getPointerAddressSpace(), CostKind,
+                  TTI::OperandValueInfo());
+              break;
+            case LoadsState::StridedVectorize:
+              VecLdCost += TTI.getStridedMemoryOpCost(
+                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind);
+              break;
+            case LoadsState::ScatterVectorize:
+              VecLdCost += TTI.getGatherScatterOpCost(
+                  Instruction::Load, SubVecTy, LI0->getPointerOperand(),
+                  /*VariableMask=*/false, CommonAlignment, CostKind);
+              break;
+            case LoadsState::Gather:
+              llvm_unreachable(
+                  "Expected only consecutive, strided or masked gather loads.");
+            }
+            VecLdCost +=
+                TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+                                   std::nullopt, CostKind, I * VF, SubVecTy);
+          }
+          // If the masked gather cost is higher, it is better to vectorize,
+          // so consider this a gather node. It will be estimated more
+          // accurately later.
+          if (MaskedGatherCost > VecLdCost)
+            return true;
+        }
+      }
+      return false;
+    };
     // TODO: need to improve analysis of the pointers, if not all of them are
     // GEPs or have > 2 operands, we end up with a gather node, which just
     // increases the cost.
@@ -4114,8 +4188,17 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         })) {
       Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
       if (TTI.isLegalMaskedGather(VecTy, CommonAlignment) &&
-          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment))
+          !TTI.forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
+        // Check if the potential masked gather can be represented as a
+        // series of loads + insertsubvectors.
+        if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
+          // If the masked gather cost is higher, it is better to vectorize,
+          // so consider this a gather node. It will be estimated more
+          // accurately later.
+          return LoadsState::Gather;
+        }
         return LoadsState::ScatterVectorize;
+      }
     }
   }
 
@@ -5554,8 +5637,8 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     // treats loading/storing it as an i8 struct. If we vectorize loads/stores
     // from such a struct, we read/write packed bits disagreeing with the
     // unvectorized version.
-    switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, *LI, *TLI, CurrentOrder,
-                              PointerOps)) {
+    switch (canVectorizeLoads(*this, VL, VL0, *TTI, *DL, *SE, *LI, *TLI,
+                              CurrentOrder, PointerOps)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
     case LoadsState::ScatterVectorize:
@@ -7336,7 +7419,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         SmallVector<Value *> PointerOps;
         OrdersType CurrentOrder;
         LoadsState LS =
-            canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+            canVectorizeLoads(R, Slice, Slice.front(), TTI, *R.DL, *R.SE,
                               *R.LI, *R.TLI, CurrentOrder, PointerOps);
         switch (LS) {
         case LoadsState::Vectorize:
@@ -7599,7 +7682,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       transformMaskAfterShuffle(CommonMask, CommonMask);
     }
     SameNodesEstimated = false;
-    Cost += createShuffle(&E1, E2, Mask);
     if (!E2 && InVectors.size() == 1) {
       unsigned VF = E1.getVectorFactor();
       if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
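For readers who want the gist of the new CheckForShuffledLoads logic without tracing the TTI calls, here is a minimal, self-contained C++ sketch of the same cost comparison. The SliceCost struct, the preferSlicedLoads helper, and every cost number are invented for illustration only; the real patch obtains all costs from TargetTransformInfo and runs the check inside canVectorizeLoads.

// Standalone sketch, not LLVM code: cost values below are placeholders
// standing in for the TargetTransformInfo queries used in the patch.
#include <cstdio>
#include <vector>

enum class LoadsState { Gather, Vectorize, ScatterVectorize, StridedVectorize };

// Hypothetical per-slice costs: how one VF-wide slice of the load list would
// be loaded, plus the shuffle that inserts it into the full vector.
struct SliceCost {
  LoadsState Kind;
  double LoadCost;            // stands in for the memory/strided/gather op cost
  double InsertSubvectorCost; // stands in for the SK_InsertSubvector shuffle cost
};

// Mirrors the patch's decision rule: sum the per-slice load and
// insert-subvector costs and prefer them whenever the single whole-vector
// masked gather would be more expensive.
static bool preferSlicedLoads(double MaskedGatherCost,
                              const std::vector<SliceCost> &Slices) {
  double VecLdCost = 0;
  for (const SliceCost &S : Slices) {
    if (S.Kind == LoadsState::Gather)
      return false; // a gathered slice means no profitable split was found
    VecLdCost += S.LoadCost + S.InsertSubvectorCost;
  }
  return MaskedGatherCost > VecLdCost;
}

int main() {
  // Invented example: an 8-lane masked gather vs. two consecutive 4-lane
  // loads, each followed by an insert-subvector shuffle.
  double MaskedGatherCost = 12.0;
  std::vector<SliceCost> Slices = {{LoadsState::Vectorize, 1.0, 1.0},
                                   {LoadsState::Vectorize, 1.0, 1.0}};
  std::printf("fall back to gather node: %s\n",
              preferSlicedLoads(MaskedGatherCost, Slices) ? "yes" : "no");
  return 0;
}

When this comparison favors the sliced form, the patched canVectorizeLoads returns LoadsState::Gather instead of ScatterVectorize, so the node can later be re-vectorized as smaller contiguous or strided loads stitched together with insert-subvector shuffles.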