Skip to content

Commit e3ec5a7

Browse files
authored
[VectorCombine] foldShuffleOfBinops - fold shuffle(binop(shuffle(x),shuffle(z)),binop(shuffle(y),shuffle(w))) -> binop(shuffle(x,z),shuffle(y,w)) (#120984)
Some patterns (in particular horizontal-style patterns) can end up with shuffles straddling both sides of a binop/cmp. Although the individual folds aren't worthwhile on their own, merging the (one-use) shuffles can notably reduce the net instruction count and cost. This is one of the final steps towards addressing #34072.
1 parent cdad183 commit e3ec5a7

File tree

4 files changed

+95
-146
lines changed

4 files changed

+95
-146
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1743,6 +1743,36 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
17431743
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, BinResTy,
17441744
OldMask, CostKind, 0, nullptr, {LHS, RHS}, &I);
17451745

1746+
// Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
1747+
// where one-use shuffles have been split across the binop/cmp. These
1748+
// often allow a major reduction in total cost that wouldn't happen as
1749+
// individual folds.
1750+
auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
1751+
TTI::TargetCostKind CostKind) -> bool {
1752+
Value *InnerOp;
1753+
ArrayRef<int> InnerMask;
1754+
if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
1755+
m_Mask(InnerMask)))) &&
1756+
InnerOp->getType() == Op->getType() &&
1757+
all_of(InnerMask,
1758+
[NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
1759+
for (int &M : Mask)
1760+
if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
1761+
M = InnerMask[M - Offset];
1762+
M = 0 <= M ? M + Offset : M;
1763+
}
1764+
OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
1765+
Op = InnerOp;
1766+
return true;
1767+
}
1768+
return false;
1769+
};
1770+
bool ReducedInstCount = false;
1771+
ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
1772+
ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
1773+
ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
1774+
ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
1775+
17461776
InstructionCost NewCost =
17471777
TTI.getShuffleCost(SK0, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z}) +
17481778
TTI.getShuffleCost(SK1, BinOpTy, NewMask1, CostKind, 0, nullptr, {Y, W});
@@ -1763,8 +1793,8 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
17631793

17641794
// If either shuffle will constant fold away, then fold for the same cost as
17651795
// we will reduce the instruction count.
1766-
bool ReducedInstCount = (isa<Constant>(X) && isa<Constant>(Z)) ||
1767-
(isa<Constant>(Y) && isa<Constant>(W));
1796+
ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
1797+
(isa<Constant>(Y) && isa<Constant>(W));
17681798
if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
17691799
return false;
17701800

0 commit comments

Comments
 (0)