Skip to content

Commit 7e7a439

Browse files
[SLP][NFC] Introduce CombinedVectorize nodes, NFC. (#99309)
This adds a combined vectorized node. It simplifies the handling of combined nodes, such as select/cmp, which can be reduced to min/max, or mul/add, which can be transformed to fma, etc. It improves cost model handling and may lead to better codegen in the future (direct emission of the intrinsics).
1 parent 1a92cc5 commit 7e7a439

File tree

1 file changed

+85
-54
lines changed

1 file changed

+85
-54
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 85 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2971,13 +2971,24 @@ class BoUpSLP {
29712971
/// (either with vector instruction or with scatter/gather
29722972
/// intrinsics for store/load)?
29732973
enum EntryState {
2974-
Vectorize,
2975-
ScatterVectorize,
2976-
StridedVectorize,
2977-
NeedToGather
2974+
Vectorize, ///< The node is regularly vectorized.
2975+
ScatterVectorize, ///< Masked scatter/gather node.
2976+
StridedVectorize, ///< Strided loads (and stores)
2977+
NeedToGather, ///< Gather/buildvector node.
2978+
CombinedVectorize, ///< Vectorized node, combined with its user into more
2979+
///< complex node like select/cmp to minmax, mul/add to
2980+
///< fma, etc. Must be used for the following nodes in
2981+
///< the pattern, not the very first one.
29782982
};
29792983
EntryState State;
29802984

2985+
/// List of combined opcodes supported by the vectorizer.
2986+
enum CombinedOpcode {
2987+
NotCombinedOp = -1,
2988+
MinMax = Instruction::OtherOpsEnd + 1,
2989+
};
2990+
CombinedOpcode CombinedOp = NotCombinedOp;
2991+
29812992
/// Does this sequence require some shuffling?
29822993
SmallVector<int, 4> ReuseShuffleIndices;
29832994

@@ -3165,6 +3176,9 @@ class BoUpSLP {
31653176
case NeedToGather:
31663177
dbgs() << "NeedToGather\n";
31673178
break;
3179+
case CombinedVectorize:
3180+
dbgs() << "CombinedVectorize\n";
3181+
break;
31683182
}
31693183
dbgs() << "MainOp: ";
31703184
if (MainOp)
@@ -7213,6 +7227,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
72137227
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
72147228
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
72157229
break;
7230+
case TreeEntry::CombinedVectorize:
72167231
case TreeEntry::NeedToGather:
72177232
llvm_unreachable("Unexpected loads state.");
72187233
}
@@ -8294,6 +8309,22 @@ void BoUpSLP::transformNodes() {
82948309
}
82958310
break;
82968311
}
8312+
case Instruction::Select: {
8313+
if (E.State != TreeEntry::Vectorize)
8314+
break;
8315+
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
8316+
if (MinMaxID == Intrinsic::not_intrinsic)
8317+
break;
8318+
// This node is a minmax node.
8319+
E.CombinedOp = TreeEntry::MinMax;
8320+
TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
8321+
if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
8322+
CondEntry->State == TreeEntry::Vectorize) {
8323+
// The condition node is part of the combined minmax node.
8324+
CondEntry->State = TreeEntry::CombinedVectorize;
8325+
}
8326+
break;
8327+
}
82978328
default:
82988329
break;
82998330
}
@@ -9430,6 +9461,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
94309461
Instruction *VL0 = E->getMainOp();
94319462
unsigned ShuffleOrOp =
94329463
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9464+
if (E->CombinedOp != TreeEntry::NotCombinedOp)
9465+
ShuffleOrOp = E->CombinedOp;
94339466
SetVector<Value *> UniqueValues(VL.begin(), VL.end());
94349467
const unsigned Sz = UniqueValues.size();
94359468
SmallBitVector UsedScalars(Sz, false);
@@ -9515,6 +9548,31 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
95159548
return VecCost - ScalarCost;
95169549
};
95179550

9551+
auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
9552+
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
9553+
if (MinMaxID == Intrinsic::not_intrinsic)
9554+
return InstructionCost::getInvalid();
9555+
Type *CanonicalType = Ty;
9556+
if (CanonicalType->isPtrOrPtrVectorTy())
9557+
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9558+
CanonicalType->getContext(),
9559+
DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9560+
9561+
IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9562+
{CanonicalType, CanonicalType});
9563+
InstructionCost IntrinsicCost =
9564+
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9565+
// If the selects are the only uses of the compares, they will be
9566+
// dead and we can adjust the cost by removing their cost.
9567+
if (VI && SelectOnly) {
9568+
assert(!Ty->isVectorTy() && "Expected only for scalar type.");
9569+
auto *CI = cast<CmpInst>(VI->getOperand(0));
9570+
IntrinsicCost -=
9571+
TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(),
9572+
CI->getPredicate(), CostKind, CI);
9573+
}
9574+
return IntrinsicCost;
9575+
};
95189576
switch (ShuffleOrOp) {
95199577
case Instruction::PHI: {
95209578
// Count reused scalars.
@@ -9775,28 +9833,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
97759833
InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
97769834
E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
97779835
CostKind, VI);
9778-
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
9779-
if (MinMaxID != Intrinsic::not_intrinsic) {
9780-
Type *CanonicalType = OrigScalarTy;
9781-
if (CanonicalType->isPtrOrPtrVectorTy())
9782-
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9783-
CanonicalType->getContext(),
9784-
DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9785-
9786-
IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9787-
{CanonicalType, CanonicalType});
9788-
InstructionCost IntrinsicCost =
9789-
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9790-
// If the selects are the only uses of the compares, they will be
9791-
// dead and we can adjust the cost by removing their cost.
9792-
if (SelectOnly) {
9793-
auto *CI = cast<CmpInst>(VI->getOperand(0));
9794-
IntrinsicCost -= TTI->getCmpSelInstrCost(
9795-
CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
9796-
CI->getPredicate(), CostKind, CI);
9797-
}
9798-
ScalarCost = std::min(ScalarCost, IntrinsicCost);
9799-
}
9836+
InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
9837+
if (IntrinsicCost.isValid())
9838+
ScalarCost = IntrinsicCost;
98009839

98019840
return ScalarCost;
98029841
};
@@ -9805,30 +9844,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
98059844

98069845
InstructionCost VecCost = TTI->getCmpSelInstrCost(
98079846
E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9808-
// Check if it is possible and profitable to use min/max for selects
9809-
// in VL.
9810-
//
9811-
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9812-
if (MinMaxID != Intrinsic::not_intrinsic) {
9813-
Type *CanonicalType = VecTy;
9814-
if (CanonicalType->isPtrOrPtrVectorTy())
9815-
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
9816-
CanonicalType->getContext(),
9817-
DL->getTypeSizeInBits(CanonicalType->getScalarType())));
9818-
IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9819-
{CanonicalType, CanonicalType});
9820-
InstructionCost IntrinsicCost =
9821-
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9822-
// If the selects are the only uses of the compares, they will be
9823-
// dead and we can adjust the cost by removing their cost.
9824-
if (SelectOnly) {
9825-
auto *CI =
9826-
cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
9827-
IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
9828-
MaskTy, VecPred, CostKind);
9829-
}
9830-
VecCost = std::min(VecCost, IntrinsicCost);
9831-
}
98329847
if (auto *SI = dyn_cast<SelectInst>(VL0)) {
98339848
auto *CondType =
98349849
getWidenedType(SI->getCondition()->getType(), VL.size());
@@ -9850,6 +9865,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
98509865
};
98519866
return GetCostDiff(GetScalarCost, GetVectorCost);
98529867
}
9868+
case TreeEntry::MinMax: {
9869+
auto GetScalarCost = [&](unsigned Idx) {
9870+
return GetMinMaxCost(OrigScalarTy);
9871+
};
9872+
auto GetVectorCost = [&](InstructionCost CommonCost) {
9873+
InstructionCost VecCost = GetMinMaxCost(VecTy);
9874+
return VecCost + CommonCost;
9875+
};
9876+
return GetCostDiff(GetScalarCost, GetVectorCost);
9877+
}
98539878
case Instruction::FNeg:
98549879
case Instruction::Add:
98559880
case Instruction::FAdd:
@@ -10588,6 +10613,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1058810613
SmallPtrSet<Value *, 4> CheckedExtracts;
1058910614
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
1059010615
TreeEntry &TE = *VectorizableTree[I];
10616+
// Combined entries are folded into another node, so do not count their cost
10617+
// separately; just skip them.
10618+
if (TE.State == TreeEntry::CombinedVectorize) {
10619+
LLVM_DEBUG(
10620+
dbgs() << "SLP: Skipping cost for combined node that starts with "
10621+
<< *TE.Scalars[0] << ".\n";
10622+
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10623+
continue;
10624+
}
1059110625
if (TE.isGather()) {
1059210626
if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
1059310627
E && E->getVectorFactor() == TE.getVectorFactor() &&
@@ -12956,10 +12990,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
1295612990
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
1295712991
};
1295812992

12959-
assert((E->State == TreeEntry::Vectorize ||
12960-
E->State == TreeEntry::ScatterVectorize ||
12961-
E->State == TreeEntry::StridedVectorize) &&
12962-
"Unhandled state");
12993+
assert(!E->isGather() && "Unhandled state");
1296312994
unsigned ShuffleOrOp =
1296412995
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
1296512996
Instruction *VL0 = E->getMainOp();

0 commit comments

Comments
 (0)