Skip to content

[SLP][NFC]Introduce CombinedVectorize nodes, NFC. #99309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 85 additions & 54 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2953,13 +2953,24 @@ class BoUpSLP {
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
enum EntryState {
Vectorize,
ScatterVectorize,
StridedVectorize,
NeedToGather
Vectorize, ///< The node is regularly vectorized.
ScatterVectorize, ///< Masked scatter/gather node.
StridedVectorize, ///< Strided loads (and stores)
NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more
///< complex node like select/cmp to minmax, mul/add to
///< fma, etc. Must be used for the following nodes in
///< the pattern, not the very first one.
};
EntryState State;

/// List of combined opcodes supported by the vectorizer.
enum CombinedOpcode {
/// No combined opcode is assigned to the node.
NotCombinedOp = -1,
/// Select-based pattern handled as a single min/max operation. The value is
/// placed past Instruction::OtherOpsEnd, presumably so it cannot collide with
/// a regular instruction opcode when both are used as the same switch key.
MinMax = Instruction::OtherOpsEnd + 1,
};
/// Combined opcode of this node; stays NotCombinedOp unless explicitly set.
CombinedOpcode CombinedOp = NotCombinedOp;

/// Does this sequence require some shuffling?
SmallVector<int, 4> ReuseShuffleIndices;

Expand Down Expand Up @@ -3147,6 +3158,9 @@ class BoUpSLP {
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
case CombinedVectorize:
dbgs() << "CombinedVectorize\n";
break;
}
dbgs() << "MainOp: ";
if (MainOp)
Expand Down Expand Up @@ -7194,6 +7208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
case TreeEntry::CombinedVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
Expand Down Expand Up @@ -8249,6 +8264,22 @@ void BoUpSLP::transformNodes() {
}
break;
}
case Instruction::Select: {
// Only regularly vectorized select nodes are candidates for combining.
if (E.State != TreeEntry::Vectorize)
break;
// Ask whether the scalars of this node can be turned into a min/max
// intrinsic. NOTE(review): SelectOnly presumably means the compares are used
// only by these selects - confirm against canConvertToMinOrMaxIntrinsic.
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
if (MinMaxID == Intrinsic::not_intrinsic)
break;
// This node is a minmax node.
E.CombinedOp = TreeEntry::MinMax;
// Operand 0 of the select node is taken as the condition entry. The
// const_cast is required because its State may be rewritten below.
TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
// The condition node is folded into this node only if it has exactly one
// user tree index and is itself regularly vectorized.
if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
CondEntry->State == TreeEntry::Vectorize) {
// The condition node is part of the combined minmax node.
CondEntry->State = TreeEntry::CombinedVectorize;
}
break;
}
default:
break;
}
Expand Down Expand Up @@ -9362,6 +9393,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
if (E->CombinedOp != TreeEntry::NotCombinedOp)
ShuffleOrOp = E->CombinedOp;
SetVector<Value *> UniqueValues(VL.begin(), VL.end());
const unsigned Sz = UniqueValues.size();
SmallBitVector UsedScalars(Sz, false);
Expand Down Expand Up @@ -9447,6 +9480,31 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return VecCost - ScalarCost;
};

// Computes the cost of expressing select(s) as a min/max intrinsic on type Ty.
// When VI is given, only that instruction is matched; otherwise the whole
// bundle VL is matched. Returns an invalid cost if no min/max intrinsic
// matches.
auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
if (MinMaxID == Intrinsic::not_intrinsic)
return InstructionCost::getInvalid();
// Pointer (or vector-of-pointer) types are costed as integers of the same
// bit width as the pointer's scalar type.
Type *CanonicalType = Ty;
if (CanonicalType->isPtrOrPtrVectorTy())
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
CanonicalType->getContext(),
DL->getTypeSizeInBits(CanonicalType->getScalarType())));

// The intrinsic is costed with the canonical type as both its return type
// and its two argument types.
IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
{CanonicalType, CanonicalType});
InstructionCost IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
// If the selects are the only uses of the compares, they will be
// dead and we can adjust the cost by removing their cost.
// This adjustment is applied only on the single-instruction (scalar) path,
// i.e. when VI is provided.
if (VI && SelectOnly) {
assert(!Ty->isVectorTy() && "Expected only for scalar type.");
auto *CI = cast<CmpInst>(VI->getOperand(0));
IntrinsicCost -=
TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(),
CI->getPredicate(), CostKind, CI);
}
return IntrinsicCost;
};
switch (ShuffleOrOp) {
case Instruction::PHI: {
// Count reused scalars.
Expand Down Expand Up @@ -9707,28 +9765,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
CostKind, VI);
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI);
if (MinMaxID != Intrinsic::not_intrinsic) {
Type *CanonicalType = OrigScalarTy;
if (CanonicalType->isPtrOrPtrVectorTy())
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
CanonicalType->getContext(),
DL->getTypeSizeInBits(CanonicalType->getScalarType())));

IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
{CanonicalType, CanonicalType});
InstructionCost IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
// If the selects are the only uses of the compares, they will be
// dead and we can adjust the cost by removing their cost.
if (SelectOnly) {
auto *CI = cast<CmpInst>(VI->getOperand(0));
IntrinsicCost -= TTI->getCmpSelInstrCost(
CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(),
CI->getPredicate(), CostKind, CI);
}
ScalarCost = std::min(ScalarCost, IntrinsicCost);
}
InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
if (IntrinsicCost.isValid())
ScalarCost = IntrinsicCost;

return ScalarCost;
};
Expand All @@ -9737,30 +9776,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,

InstructionCost VecCost = TTI->getCmpSelInstrCost(
E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
// Check if it is possible and profitable to use min/max for selects
// in VL.
//
auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
if (MinMaxID != Intrinsic::not_intrinsic) {
Type *CanonicalType = VecTy;
if (CanonicalType->isPtrOrPtrVectorTy())
CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
CanonicalType->getContext(),
DL->getTypeSizeInBits(CanonicalType->getScalarType())));
IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
{CanonicalType, CanonicalType});
InstructionCost IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
// If the selects are the only uses of the compares, they will be
// dead and we can adjust the cost by removing their cost.
if (SelectOnly) {
auto *CI =
cast<CmpInst>(cast<Instruction>(VL.front())->getOperand(0));
IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy,
MaskTy, VecPred, CostKind);
}
VecCost = std::min(VecCost, IntrinsicCost);
}
if (auto *SI = dyn_cast<SelectInst>(VL0)) {
auto *CondType =
getWidenedType(SI->getCondition()->getType(), VL.size());
Expand All @@ -9782,6 +9797,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
case TreeEntry::MinMax: {
// Cost of a node whose CombinedOp is MinMax: both the scalar and the vector
// side are priced as min/max intrinsics.
// Scalar side: Idx is unused, so every lane gets the same cost. No
// instruction is passed, so the intrinsic is matched on the whole bundle and
// no compare cost is subtracted.
auto GetScalarCost = [&](unsigned Idx) {
return GetMinMaxCost(OrigScalarTy);
};
// Vector side: min/max intrinsic cost on the vector type plus the common
// cost supplied by the caller.
auto GetVectorCost = [&](InstructionCost CommonCost) {
InstructionCost VecCost = GetMinMaxCost(VecTy);
return VecCost + CommonCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
}
case Instruction::FNeg:
case Instruction::Add:
case Instruction::FAdd:
Expand Down Expand Up @@ -10518,6 +10543,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
// No need to count the cost of combined entries: they are combined with
// their user node, so just skip them here.
if (TE.State == TreeEntry::CombinedVectorize) {
LLVM_DEBUG(
dbgs() << "SLP: Skipping cost for combined node that starts with "
<< *TE.Scalars[0] << ".\n";
TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
continue;
}
if (TE.isGather()) {
if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
E && E->getVectorFactor() == TE.getVectorFactor() &&
Expand Down Expand Up @@ -12864,10 +12898,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
};

assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize ||
E->State == TreeEntry::StridedVectorize) &&
"Unhandled state");
assert(!E->isGather() && "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
Instruction *VL0 = E->getMainOp();
Expand Down
Loading