[SLP] NFC. Replace TreeEntry::setOperandsInOrder with VLOperands. #113880

Merged: 1 commit, Dec 6, 2024
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (207 changes: 65 additions, 142 deletions)
@@ -2017,6 +2017,9 @@ class BoUpSLP {

/// A vector of operand vectors.
SmallVector<OperandDataVec, 4> OpsVec;
/// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
/// is not IntrinsicInst, ArgSize is User::getNumOperands.
unsigned ArgSize = 0;

const TargetLibraryInfo &TLI;
const DataLayout &DL;
@@ -2404,10 +2407,12 @@ class BoUpSLP {
assert(!VL.empty() && "Bad VL");
assert((empty() || VL.size() == getNumLanes()) &&
"Expected same number of lanes");
// IntrinsicInst::isCommutative returns true if swapping the first "two"
// arguments to the intrinsic produces the same result.
constexpr unsigned IntrinsicNumOperands = 2;
auto *VL0 = cast<Instruction>(*find_if(VL, IsaPred<Instruction>));
unsigned NumOperands = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands
: VL0->getNumOperands();
unsigned NumOperands = VL0->getNumOperands();
ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2440,7 +2445,7 @@ class BoUpSLP {
}

/// \returns the number of operands.
unsigned getNumOperands() const { return OpsVec.size(); }
unsigned getNumOperands() const { return ArgSize; }

/// \returns the number of lanes.
unsigned getNumLanes() const { return OpsVec[0].size(); }
@@ -2617,7 +2622,8 @@ class BoUpSLP {
ArrayRef<OperandData> Op0 = OpsVec.front();
for (const OperandData &Data : Op0)
UniqueValues.insert(Data.V);
for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
for (ArrayRef<OperandData> Op :
ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
if (any_of(Op, [&UniqueValues](const OperandData &Data) {
return !UniqueValues.contains(Data.V);
}))
@@ -3138,13 +3144,6 @@ class BoUpSLP {
SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
8> &GatheredLoads);

/// Reorder commutative or alt operands to get better probability of
/// generating vectorized code.
static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const BoUpSLP &R);

/// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
/// users of \p TE and collects the stores. It returns the map from the store
/// pointers to the collected stores.
@@ -3339,27 +3338,15 @@ class BoUpSLP {
copy(OpVL, Operands[OpIdx].begin());
}

/// Set the operands of this bundle in their original order.
void setOperandsInOrder() {
assert(Operands.empty() && "Already initialized?");
auto *I0 = cast<Instruction>(*find_if(Scalars, IsaPred<Instruction>));
Operands.resize(I0->getNumOperands());
unsigned NumLanes = Scalars.size();
for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
OpIdx != NumOperands; ++OpIdx) {
Operands[OpIdx].resize(NumLanes);
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
if (isa<PoisonValue>(Scalars[Lane])) {
Operands[OpIdx][Lane] =
PoisonValue::get(I0->getOperand(OpIdx)->getType());
continue;
}
auto *I = cast<Instruction>(Scalars[Lane]);
assert(I->getNumOperands() == NumOperands &&
"Expected same number of operands");
Operands[OpIdx][Lane] = I->getOperand(OpIdx);
}
}
/// Set this bundle's operand from \p VL.
void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
bool RequireReorder = false) {
VLOperands Ops(VL, R);
if (RequireReorder)
Ops.reorder();
for (unsigned I :
seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
setOperand(I, Ops.getVL(I));
}

/// Reorders operands of the node to the given mask \p Mask.
@@ -8459,7 +8446,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
{}, CurrentOrder);
LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");

TE->setOperandsInOrder();
TE->setOperand(VL, *this);
buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
return;
}
@@ -8480,27 +8467,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
else
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
TE->setOperandsInOrder();
break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
break;
case TreeEntry::ScatterVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndices);
TE->setOperandsInOrder();
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
break;
case TreeEntry::CombinedVectorize:
case TreeEntry::NeedToGather:
llvm_unreachable("Unexpected loads state.");
}
TE->setOperand(VL, *this);
if (State == TreeEntry::ScatterVectorize)
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
return;
}
case Instruction::ZExt:
@@ -8538,8 +8524,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndices);
LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");

TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
TE->setOperand(VL, *this);
for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
if (ShuffleOrOp == Instruction::Trunc) {
ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
@@ -8566,12 +8552,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");

ValueList Left, Right;
VLOperands Ops(VL, *this);
if (cast<CmpInst>(VL0)->isCommutative()) {
// Commutative predicate - collect + sort operands of the instructions
// so that each side is more likely to have the same opcode.
assert(P0 == CmpInst::getSwappedPredicate(P0) &&
"Commutative Predicate mismatch");
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
Ops.reorder();
Left = Ops.getVL(0);
Right = Ops.getVL(1);
} else {
// Collect operands - commute if it uses the swapped predicate.
for (Value *V : VL) {
@@ -8632,20 +8621,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ReuseShuffleIndices);
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");

// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
return;
}

TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
TE->setOperand(VL, *this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
@@ -8710,7 +8687,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices, CurrentOrder);
TE->setOperandsInOrder();
TE->setOperand(VL, *this);
buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
if (Consecutive)
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
@@ -8726,46 +8703,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isCommutative(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
SmallVector<ValueList> Operands;
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
Operands.emplace_back();
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
Comment on lines -8739 to -8740
Member:

There is no such check in the new code in setOperand. What if one of the first 2 operands is isVectorIntrinsicWithScalarOpAtArg()?
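
For background, isVectorIntrinsicWithScalarOpAtArg reports whether a given argument of a vector intrinsic must stay scalar, so the vectorizer must not build a vector tree for it. Below is a minimal sketch of that kind of check, not part of this patch; it assumes the two-argument overload declared in llvm/Analysis/VectorUtils.h and uses llvm.powi's integer exponent as an assumed example of a scalar-only argument, and the helper name is hypothetical:

```cpp
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Returns true if argument ArgIdx of the intrinsic may be turned into a
// vector operand; scalar-only arguments must be left as they are.
static bool mayVectorizeIntrinsicArg(Intrinsic::ID ID, unsigned ArgIdx) {
  return !isVectorIntrinsicWithScalarOpAtArg(ID, ArgIdx);
}

// Assumed example: for llvm.powi the base (arg 0) may be vectorized, while
// the i32 exponent (arg 1) is expected to stay scalar.
//   mayVectorizeIntrinsicArg(Intrinsic::powi, 0) -> true
//   mayVectorizeIntrinsicArg(Intrinsic::powi, 1) -> false
```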

Contributor Author:

for (unsigned I : seq<unsigned>(2, CI->arg_size())) {

That loop starts at 2, so the old code never checked the first two operands for that property either.

The only difference is whether such an operand gets stored. For the other arguments (index >= 2) that are isVectorIntrinsicWithScalarOpAtArg, the old code did not store them via setOperand, while the new code does store them.

But neither the old code nor the new code calls buildTree_rec for an argument that is isVectorIntrinsicWithScalarOpAtArg.
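
A dependency-free sketch of that difference follows; the three-argument commutative intrinsic and the scalar-only index 2 are invented for illustration. Only the set of stored operands changes, the recursion set does not.

```cpp
#include <cstdio>

// Hypothetical: argument 2 of the example intrinsic must stay scalar.
static bool isScalarOnlyArg(unsigned Idx) { return Idx == 2; }

int main() {
  const unsigned NumArgs = 3;
  for (unsigned I = 0; I < NumArgs; ++I) {
    // Old path: setOperand(0, Left) / setOperand(1, Right); args >= 2 were
    // stored only when not scalar-only.
    bool OldStored = I < 2 || !isScalarOnlyArg(I);
    // New path: setOperand(VL, R, /*RequireReorder=*/true) stores every arg.
    bool NewStored = true;
    // Both paths skip buildTree_rec for scalar-only arguments.
    bool Recurses = !isScalarOnlyArg(I);
    std::printf("arg %u: old stored=%d  new stored=%d  buildTree_rec=%d\n", I,
                OldStored, NewStored, Recurses);
  }
  return 0;
}
```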

Member:

Better to avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args

Contributor Author:

But for an IntrinsicInst that is NOT commutative, all arguments are already stored into the TreeEntry (and SLP still does not call buildTree_rec for the isVectorIntrinsicWithScalarOpAtArg arguments).

If we have to "avoid calling setOperand for isVectorIntrinsicWithScalarOpAtArg args" for commutative IntrinsicInst as well, we have to do something like the following patch.

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d8c596eb875d..54210cc0b149 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3267,8 +3267,18 @@ private:
     void setOperand(ArrayRef<Value *> VL, const BoUpSLP &R,
                     bool RequireReorder = false) {
       VLOperands Ops(VL, R);
-      if (RequireReorder)
+      if (RequireReorder) {
         Ops.reorder();
+        if (auto *CI = dyn_cast<CallInst>(VL[0])) {
+          Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, R.TLI);
+          for (unsigned I : seq<unsigned>(CI->arg_size())) {
+            if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
+              continue;
+            setOperand(I, Ops.getVL(I));
+          }
+          return;
+        }
+      }
       for (unsigned I :
            seq<unsigned>(cast<Instruction>(VL[0])->getNumOperands()))
         setOperand(I, Ops.getVL(I));

But it does not look elegant.

Member:

We should maintain correctness where possible and should not produce "noise" entries in the graph. They may cause unexpected side effects and some extra compiler crashes.

for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
Operands.back().push_back(CI2->getArgOperand(I));
}
TE->setOperand(I, Operands.back());
Member:

What about these operands, >= 2?

Contributor Author:

What do you mean? They are still in TE. appendOperandsOfVL did not forget them.

}
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
if (Operands[I - 2].empty())
continue;
buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
}
return;
}
TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
TE->setOperand(VL, *this, isCommutative(VL0));
for (unsigned I : seq<unsigned>(CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
// vectorize it.
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
Operands.push_back(CI2->getArgOperand(I));
}
buildTree_rec(Operands, Depth + 1, {TE, I});
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
}
return;
}
@@ -8776,43 +8720,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

// Reorder operands if reordering would enable vectorization.
auto *CI = dyn_cast<CmpInst>(VL0);
if (isa<BinaryOperator>(VL0) || CI) {
if (CI && any_of(VL, [](Value *V) {
return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
})) {
auto *MainCI = cast<CmpInst>(S.MainOp);
auto *AltCI = cast<CmpInst>(S.AltOp);
CmpInst::Predicate MainP = MainCI->getPredicate();
CmpInst::Predicate AltP = AltCI->getPredicate();
assert(MainP != AltP &&
"Expected different main/alternate predicates.");
ValueList Left, Right;
if (!CI || all_of(VL, [](Value *V) {
return isa<PoisonValue>(V) || cast<CmpInst>(V)->isCommutative();
})) {
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
} else {
auto *MainCI = cast<CmpInst>(S.MainOp);
auto *AltCI = cast<CmpInst>(S.AltOp);
CmpInst::Predicate MainP = MainCI->getPredicate();
CmpInst::Predicate AltP = AltCI->getPredicate();
assert(MainP != AltP &&
"Expected different main/alternate predicates.");
// Collect operands - commute if it uses the swapped predicate or
// alternate operation.
for (Value *V : VL) {
if (isa<PoisonValue>(V)) {
Left.push_back(
PoisonValue::get(MainCI->getOperand(0)->getType()));
Right.push_back(
PoisonValue::get(MainCI->getOperand(1)->getType()));
continue;
}
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
// Collect operands - commute if it uses the swapped predicate or
// alternate operation.
for (Value *V : VL) {
if (isa<PoisonValue>(V)) {
Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
continue;
}
auto *Cmp = cast<CmpInst>(V);
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);

if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
} else {
if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
}
Left.push_back(LHS);
Right.push_back(RHS);
if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
} else {
if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
std::swap(LHS, RHS);
}
Left.push_back(LHS);
Right.push_back(RHS);
}
TE->setOperand(0, Left);
TE->setOperand(1, Right);
@@ -8821,8 +8759,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}

TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
TE->setOperand(VL, *this, isa<BinaryOperator>(VL0) || CI);
for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
return;
}
@@ -13526,21 +13464,6 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
return Cost;
}

// Perform operand reordering on the instructions in VL and return the reordered
// operands in Left and Right.
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right,
const BoUpSLP &R) {
if (VL.empty())
return;
VLOperands Ops(VL, R);
// Reorder the operands in place.
Ops.reorder();
Left = Ops.getVL(0);
Right = Ops.getVL(1);
}

Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
if (Res)