Commit b5aaf9d

[InstCombine] Implement vp.reverse reordering/elimination through binop/unop (#143963)
This simply copies the structure of the vector.reverse patterns from just above and reimplements them for the vp.reverse intrinsics when the mask is all ones and the EVLs exactly match. It's unfortunate that we have three different ways to represent a reverse (shuffle, vector.reverse, and vp.reverse), but I don't see an obvious way to remove any of them because the semantics are slightly different. This significantly improves vectorization in TSVC_2's s112 and s1112 loops when using EVL tail folding.
1 parent 5d502ae commit b5aaf9d
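
As a rough illustration of the binop case (operand names below are illustrative, not taken from the patch): when both operands of a binary op are vp.reverse calls with an all-ones mask and the same EVL, the op can be applied to the unreversed vectors, and if the result itself feeds a matching vp.reverse, the reverses cancel entirely:

  ; before
  %a.rev   = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %b.rev   = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %add     = add nsw <vscale x 4 x i32> %a.rev, %b.rev
  %add.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)

  ; after
  %add.rev = add nsw <vscale x 4 x i32> %a, %b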

File tree

3 files changed: +97, -23 lines


llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
Lines changed: 19 additions & 0 deletions

@@ -3571,6 +3571,25 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *X;
+    Value *Vec = II->getArgOperand(0);
+    Value *Mask = II->getArgOperand(1);
+    if (!match(Mask, m_AllOnes()))
+      break;
+    Value *EVL = II->getArgOperand(2);
+    // rev(unop rev(X)) --> unop X
+    if (match(Vec,
+              m_OneUse(m_UnOp(m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                  m_Value(X), m_AllOnes(), m_Specific(EVL)))))) {
+      auto *OldUnOp = cast<UnaryOperator>(Vec);
+      auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(
+          OldUnOp->getOpcode(), X, OldUnOp, OldUnOp->getName(),
+          II->getIterator());
+      return replaceInstUsesWith(CI, NewUnOp);
+    }
+    break;
+  }
   case Intrinsic::vector_reduce_or:
   case Intrinsic::vector_reduce_and: {
     // Canonicalize logical or/and reductions:
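
A minimal sketch of what the new case above folds, using fneg as the unary op and names chosen purely for illustration: an outer vp.reverse of a one-use unary op whose operand is a matching inner vp.reverse (same all-ones mask and EVL) is replaced by the unary op on the original vector.

  ; before
  %a.rev  = call <vscale x 4 x float> @llvm.experimental.vp.reverse(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %op     = fneg <vscale x 4 x float> %a.rev
  %op.rev = call <vscale x 4 x float> @llvm.experimental.vp.reverse(<vscale x 4 x float> %op, <vscale x 4 x i1> splat (i1 true), i32 %evl)

  ; after
  %op.rev = fneg <vscale x 4 x float> %a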

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
Lines changed: 33 additions & 0 deletions

@@ -2231,6 +2231,39 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
   else if (isSplatValue(LHS) && match(RHS, m_OneUse(m_VecReverse(m_Value(V2)))))
     return createBinOpReverse(LHS, V2);
 
+  auto createBinOpVPReverse = [&](Value *X, Value *Y, Value *EVL) {
+    Value *V = Builder.CreateBinOp(Opcode, X, Y, Inst.getName());
+    if (auto *BO = dyn_cast<BinaryOperator>(V))
+      BO->copyIRFlags(&Inst);
+
+    ElementCount EC = cast<VectorType>(V->getType())->getElementCount();
+    Value *AllTrueMask = Builder.CreateVectorSplat(EC, Builder.getTrue());
+    Module *M = Inst.getModule();
+    Function *F = Intrinsic::getOrInsertDeclaration(
+        M, Intrinsic::experimental_vp_reverse, V->getType());
+    return CallInst::Create(F, {V, AllTrueMask, EVL});
+  };
+
+  Value *EVL;
+  if (match(LHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                     m_Value(V1), m_AllOnes(), m_Value(EVL)))) {
+    // Op(rev(V1), rev(V2)) -> rev(Op(V1, V2))
+    if (match(RHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                       m_Value(V2), m_AllOnes(), m_Specific(EVL))) &&
+        (LHS->hasOneUse() || RHS->hasOneUse() ||
+         (LHS == RHS && LHS->hasNUses(2))))
+      return createBinOpVPReverse(V1, V2, EVL);
+
+    // Op(rev(V1), RHSSplat)) -> rev(Op(V1, RHSSplat))
+    if (LHS->hasOneUse() && isSplatValue(RHS))
+      return createBinOpVPReverse(V1, RHS, EVL);
+  }
+  // Op(LHSSplat, rev(V2)) -> rev(Op(LHSSplat, V2))
+  else if (isSplatValue(LHS) &&
+           match(RHS, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                          m_Value(V2), m_AllOnes(), m_Value(EVL))))
+    return createBinOpVPReverse(LHS, V2, EVL);
+
   // It may not be safe to reorder shuffles and things like div, urem, etc.
   // because we may trap when executing those ops on unknown vector elements.
   // See PR20059.
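
For the splat-operand cases handled above, the binop is hoisted through the reverse rather than eliminated outright; a sketch with an illustrative constant splat:

  ; before
  %a.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %add   = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)

  ; after
  %add     = add nsw <vscale x 4 x i32> %a, splat (i32 22)
  %add.rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)

If the result then feeds another vp.reverse with the same all-ones mask and EVL, the two reverses cancel, which is what the binop_reverse_splat_elim tests below check.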

llvm/test/Transforms/InstCombine/vp-reverse.ll
Lines changed: 45 additions & 23 deletions

@@ -3,11 +3,8 @@
 
 define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)

@@ -16,8 +13,10 @@ define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4
   ret <vscale x 4 x i32> %add.rev
 }
 
-define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
-; CHECK-LABEL: @binop_reverse_elim2(
+; Negative test - the mask needs to be reversed between the inner and
+; the outer to be correct.
+define <vscale x 4 x i32> @binop_reverse_elim_samemask(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_samemask(
 ; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
 ; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]

@@ -48,10 +47,9 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffmask(<vscale x 4 x i32> %a, <v
 
 define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim_diffevl(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
+; CHECK-NEXT:    [[ADD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 10)
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)

@@ -63,10 +61,8 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vs
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)

@@ -76,23 +72,49 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim2(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
   %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %add.rev
 }
 
+define <vscale x 4 x i32> @binop_reverse_splat_elim3(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim3(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[B_VEC]], [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %b.vec, %a.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim4(<vscale x 4 x i32> %a, i32 %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim4(
+; CHECK-NEXT:    [[B_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i64 0
+; CHECK-NEXT:    [[B_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[B_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B_VEC]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
+;
+  %b.ins = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+  %b.vec = shufflevector <vscale x 4 x i32> %b.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.vec
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
 define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
 ; CHECK-LABEL: @unop_reverse_splat_elim(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
-; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %op = fneg <vscale x 4 x float> %a.rev