@@ -2971,13 +2971,24 @@ class BoUpSLP {
2971
2971
/// (either with vector instruction or with scatter/gather
2972
2972
/// intrinsics for store/load)?
2973
2973
enum EntryState {
2974
- Vectorize,
2975
- ScatterVectorize,
2976
- StridedVectorize,
2977
- NeedToGather
2974
+ Vectorize, ///< The node is regularly vectorized.
2975
+ ScatterVectorize, ///< Masked scatter/gather node.
2976
+ StridedVectorize, ///< Strided loads (and stores).
2977
+ NeedToGather, ///< Gather/buildvector node.
2978
+ CombinedVectorize, ///< Vectorized node that is combined with its user into
2979
+ ///< a more complex node (e.g. select/cmp to minmax,
2980
+ ///< mul/add to fma). Must be set only on the non-root
2981
+ ///< nodes of the pattern, never on its very first node.
2978
2982
};
2979
2983
EntryState State;
2980
2984
2985
+ /// List of combined opcodes supported by the vectorizer.
2986
+ enum CombinedOpcode {
2987
+ NotCombinedOp = -1, ///< The entry is not a combined node.
2988
+ MinMax = Instruction::OtherOpsEnd + 1, ///< Min/max combined node; numbered past Instruction::OtherOpsEnd, presumably to avoid clashing with regular opcodes.
2989
+ };
2990
+ CombinedOpcode CombinedOp = NotCombinedOp; ///< Combined opcode of this entry; NotCombinedOp by default.
2991
+
2981
2992
/// Does this sequence require some shuffling?
2982
2993
SmallVector<int, 4> ReuseShuffleIndices;
2983
2994
@@ -3165,6 +3176,9 @@ class BoUpSLP {
3165
3176
case NeedToGather:
3166
3177
dbgs() << "NeedToGather\n";
3167
3178
break;
3179
+ case CombinedVectorize:
3180
+ dbgs() << "CombinedVectorize\n";
3181
+ break;
3168
3182
}
3169
3183
dbgs() << "MainOp: ";
3170
3184
if (MainOp)
@@ -7213,6 +7227,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7213
7227
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
7214
7228
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7215
7229
break;
7230
+ case TreeEntry::CombinedVectorize:
7216
7231
case TreeEntry::NeedToGather:
7217
7232
llvm_unreachable("Unexpected loads state.");
7218
7233
}
@@ -8294,6 +8309,22 @@ void BoUpSLP::transformNodes() {
8294
8309
}
8295
8310
break;
8296
8311
}
8312
+ case Instruction::Select: {
8313
+ if (E.State != TreeEntry::Vectorize)
8314
+ break;
8315
+ auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
8316
+ if (MinMaxID == Intrinsic::not_intrinsic)
8317
+ break;
8318
+ // This node is a minmax node.
8319
+ E.CombinedOp = TreeEntry::MinMax;
8320
+ TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
8321
+ if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
8322
+ CondEntry->State == TreeEntry::Vectorize) {
8323
+ // The condition node is part of the combined minmax node.
8324
+ CondEntry->State = TreeEntry::CombinedVectorize;
8325
+ }
8326
+ break;
8327
+ }
8297
8328
default:
8298
8329
break;
8299
8330
}
@@ -9430,6 +9461,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9430
9461
Instruction *VL0 = E->getMainOp();
9431
9462
unsigned ShuffleOrOp =
9432
9463
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9464
+ if (E->CombinedOp != TreeEntry::NotCombinedOp)
9465
+ ShuffleOrOp = E->CombinedOp;
9433
9466
SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9434
9467
const unsigned Sz = UniqueValues.size();
9435
9468
SmallBitVector UsedScalars(Sz, false);
@@ -9515,6 +9548,31 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9515
9548
return VecCost - ScalarCost;
9516
9549
};
9517
9550
9551
+ auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { // Cost of VI (or of the whole VL when VI is null) as a min/max intrinsic on type Ty.
9552
+ auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
9553
+ if (MinMaxID == Intrinsic::not_intrinsic)
9554
+ return InstructionCost::getInvalid(); // Not convertible to min/max: report an invalid cost.
9555
+ Type *CanonicalType = Ty;
9556
+ if (CanonicalType->isPtrOrPtrVectorTy()) // Pointer types are costed as integers of the same bit width.
9557
+ CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9558
+ CanonicalType->getContext(),
9559
+ DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9560
+ // Query the target cost of the intrinsic taking two operands of that type.
9561
+ IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9562
+ {CanonicalType, CanonicalType});
9563
+ InstructionCost IntrinsicCost =
9564
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9565
+ // If the selects are the only uses of the compares, the compares will be
9566
+ // dead, so subtract their cost. Done only for the scalar (VI) query.
9567
+ if (VI && SelectOnly) {
9568
+ assert(!Ty->isVectorTy() && "Expected only for scalar type.");
9569
+ auto *CI = cast<CmpInst>(VI->getOperand(0));
9570
+ IntrinsicCost -=
9571
+ TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(),
9572
+ CI->getPredicate(), CostKind, CI);
9573
+ }
9574
+ return IntrinsicCost;
9575
+ };
9518
9576
switch (ShuffleOrOp) {
9519
9577
case Instruction::PHI: {
9520
9578
// Count reused scalars.
@@ -9775,28 +9833,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9775
9833
InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9776
9834
E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
9777
9835
CostKind, VI);
9778
- auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9779
- if (MinMaxID != Intrinsic::not_intrinsic) {
9780
- Type *CanonicalType = OrigScalarTy;
9781
- if (CanonicalType->isPtrOrPtrVectorTy())
9782
- CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9783
- CanonicalType->getContext(),
9784
- DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9785
-
9786
- IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9787
- {CanonicalType, CanonicalType});
9788
- InstructionCost IntrinsicCost =
9789
- TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9790
- // If the selects are the only uses of the compares, they will be
9791
- // dead and we can adjust the cost by removing their cost.
9792
- if (SelectOnly) {
9793
- auto *CI = cast<CmpInst>(VI->getOperand(0));
9794
- IntrinsicCost -= TTI->getCmpSelInstrCost(
9795
- CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9796
- CI->getPredicate(), CostKind, CI);
9797
- }
9798
- ScalarCost = std::min(ScalarCost, IntrinsicCost);
9799
- }
9836
+ InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
9837
+ if (IntrinsicCost.isValid())
9838
+ ScalarCost = IntrinsicCost;
9800
9839
9801
9840
return ScalarCost;
9802
9841
};
@@ -9805,30 +9844,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9805
9844
9806
9845
InstructionCost VecCost = TTI->getCmpSelInstrCost(
9807
9846
E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9808
- // Check if it is possible and profitable to use min/max for selects
9809
- // in VL.
9810
- //
9811
- auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9812
- if (MinMaxID != Intrinsic::not_intrinsic) {
9813
- Type *CanonicalType = VecTy;
9814
- if (CanonicalType->isPtrOrPtrVectorTy())
9815
- CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9816
- CanonicalType->getContext(),
9817
- DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9818
- IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9819
- {CanonicalType, CanonicalType});
9820
- InstructionCost IntrinsicCost =
9821
- TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9822
- // If the selects are the only uses of the compares, they will be
9823
- // dead and we can adjust the cost by removing their cost.
9824
- if (SelectOnly) {
9825
- auto *CI =
9826
- cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9827
- IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9828
- MaskTy, VecPred, CostKind);
9829
- }
9830
- VecCost = std::min(VecCost, IntrinsicCost);
9831
- }
9832
9847
if (auto *SI = dyn_cast<SelectInst>(VL0)) {
9833
9848
auto *CondType =
9834
9849
getWidenedType(SI->getCondition()->getType(), VL.size());
@@ -9850,6 +9865,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9850
9865
};
9851
9866
return GetCostDiff(GetScalarCost, GetVectorCost);
9852
9867
}
9868
+ case TreeEntry::MinMax: {
9869
+ auto GetScalarCost = [&](unsigned Idx) {
9870
+ return GetMinMaxCost(OrigScalarTy);
9871
+ };
9872
+ auto GetVectorCost = [&](InstructionCost CommonCost) {
9873
+ InstructionCost VecCost = GetMinMaxCost(VecTy);
9874
+ return VecCost + CommonCost;
9875
+ };
9876
+ return GetCostDiff(GetScalarCost, GetVectorCost);
9877
+ }
9853
9878
case Instruction::FNeg:
9854
9879
case Instruction::Add:
9855
9880
case Instruction::FAdd:
@@ -10588,6 +10613,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10588
10613
SmallPtrSet<Value *, 4> CheckedExtracts;
10589
10614
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10590
10615
TreeEntry &TE = *VectorizableTree[I];
10616
+ // Combined entries are folded into their user node, so no separate cost is
10617
+ // counted for them; skip them.
10618
+ if (TE.State == TreeEntry::CombinedVectorize) {
10619
+ LLVM_DEBUG(
10620
+ dbgs() << "SLP: Skipping cost for combined node that starts with "
10621
+ << *TE.Scalars[0] << ".\n";
10622
+ TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10623
+ continue;
10624
+ }
10591
10625
if (TE.isGather()) {
10592
10626
if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10593
10627
E && E->getVectorFactor() == TE.getVectorFactor() &&
@@ -12956,10 +12990,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12956
12990
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12957
12991
};
12958
12992
12959
- assert((E->State == TreeEntry::Vectorize ||
12960
- E->State == TreeEntry::ScatterVectorize ||
12961
- E->State == TreeEntry::StridedVectorize) &&
12962
- "Unhandled state");
12993
+ assert(!E->isGather() && "Unhandled state");
12963
12994
unsigned ShuffleOrOp =
12964
12995
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12965
12996
Instruction *VL0 = E->getMainOp();
0 commit comments