Skip to content

Commit 381e991

Browse files
committed
[VectorCombine] foldShuffleOfBinops - fold shuffle(binop(shuffle(x),shuffle(z)),binop(shuffle(y),shuffle(w))) -> binop(shuffle(x,z),shuffle(y,w))
Some patterns (in particular horizontal-style patterns) can end up with shuffles straddling both sides of a binop/cmp. Although the individual folds aren't worth it on their own, merging the (one-use) shuffles can notably reduce the net instruction count and cost. This is one of the final steps towards addressing #34072.
1 parent 35dc654 commit 381e991

File tree

4 files changed

+95
-146
lines changed

4 files changed

+95
-146
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1732,6 +1732,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
17321732
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
17331733
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
17341734

1735+
// Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
1736+
// where one-use shuffles have gotten split across the binop/cmp. These
1737+
// often allow a major reduction in total cost that wouldn't happen as
1738+
// individual folds.
1739+
auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
1740+
TTI::TargetCostKind CostKind) -> bool {
1741+
Value *InnerOp;
1742+
ArrayRef<int> InnerMask;
1743+
if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
1744+
m_Mask(InnerMask)))) &&
1745+
all_of(InnerMask,
1746+
[NumSrcElts](int M) { return M < (int)NumSrcElts; }) &&
1747+
InnerOp->getType() == Op->getType()) {
1748+
for (int &M : Mask)
1749+
if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
1750+
M = InnerMask[M - Offset];
1751+
M = 0 <= M ? M + Offset : M;
1752+
}
1753+
OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
1754+
Op = InnerOp;
1755+
return true;
1756+
}
1757+
return false;
1758+
};
1759+
bool ReducedInstCount = false;
1760+
ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
1761+
ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
1762+
ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
1763+
ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
1764+
17351765
InstructionCost NewCost =
17361766
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
17371767
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
@@ -1752,8 +1782,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
17521782

17531783
// If either shuffle will constant fold away, then fold for the same cost as
17541784
// we will reduce the instruction count.
1755-
bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
1756-
(isa<Constant>(Y) && isa<Constant>(W));
1785+
ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
1786+
(isa<Constant>(Y) && isa<Constant>(W));
17571787
if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
17581788
return false;
17591789

0 commit comments

Comments
 (0)