Skip to content

Commit 65c77d1

Browse files
committed
[RISCV] Widen i1 AnyOf reductions
1 parent b9cb338 commit 65c77d1

File tree

3 files changed

+114
-39
lines changed

3 files changed

+114
-39
lines changed

llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/IR/PatternMatch.h"
2626
#include "llvm/InitializePasses.h"
2727
#include "llvm/Pass.h"
28+
#include "llvm/Transforms/Utils/Local.h"
2829

2930
using namespace llvm;
3031

@@ -58,6 +59,7 @@ class RISCVCodeGenPrepare : public FunctionPass,
5859
bool visitAnd(BinaryOperator &BO);
5960
bool visitIntrinsicInst(IntrinsicInst &I);
6061
bool expandVPStrideLoad(IntrinsicInst &I);
62+
bool widenVPMerge(IntrinsicInst &I);
6163
};
6264

6365
} // end anonymous namespace
@@ -103,6 +105,82 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
103105
return true;
104106
}
105107

108+
// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge like
109+
// follows:
110+
//
111+
// loop:
112+
// %phi = phi <vscale x 4 x i1> [ zeroinitializer, %entry ], [ %rec, %loop ]
113+
// %cmp = icmp ...
114+
// %or = or <vscale x 4 x i1> %phi, %cmp
115+
// %rec = call <vscale x 4 x i1> @llvm.vp.merge(%mask, %or, %phi, %evl)
116+
// ...
117+
// middle:
118+
// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %rec)
119+
//
120+
// However RVV doesn't have any tail undisturbed mask instructions and so we
121+
// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see
122+
// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll.
123+
//
124+
// To avoid that, this widens the i1 vp.merge to an i8 vp.merge, which will
125+
// usually be folded into a masked vor.vv.
126+
//
127+
// loop:
128+
// %phi = phi <vscale x 4 x i8> [ zeroinitializer, %entry ], [ %rec, %loop ]
129+
// %cmp = icmp ...
130+
// %zext = zext <vscale x 4 x i1> %cmp to <vscale x 4 x i8>
131+
// %or = or <vscale x 4 x i8> %phi, %zext
132+
// %rec = call <vscale x 4 x i8> @llvm.vp.merge(%mask, %or, %phi, %evl)
133+
// %trunc = trunc <vscale x 4 x i8> %rec to <vscale x 4 x i1>
134+
// ...
135+
// middle:
136+
// %res = call i1 @llvm.vector.reduce.or(<vscale x 4 x i1> %trunc)
137+
//
138+
// The trunc will normally be sunk outside of the loop, but even if there are
139+
// users inside the loop it is still profitable.
140+
// Returns true if \p II was an i1 vp.merge recurrence of the form
// vp.merge(Mask, or(Phi, Cond), Phi, EVL) and has been rewritten to operate on
// i8 elements; returns false and leaves the IR untouched otherwise.
bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) {
  // Only i1 element types are widened.
  if (!II.getType()->getScalarType()->isIntegerTy(1))
    return false;

  Value *Mask, *PhiV, *Cond, *EVL;

  using namespace PatternMatch;
  // Match vp.merge(Mask, or(PhiV, Cond), PhiV, EVL) where the or has a single
  // use (the merge itself), so the old chain can be deleted afterwards.
  if (!match(&II,
             m_Intrinsic<Intrinsic::vp_merge>(
                 m_Value(Mask), m_OneUse(m_c_Or(m_Value(PhiV), m_Value(Cond))),
                 m_Deferred(PhiV), m_Value(EVL))))
    return false;

  // PhiV is an arbitrary value at this point. Make sure it really is a
  // two-input phi before inspecting any of its incoming values, otherwise we
  // would dereference a null pointer or index past the incoming list.
  auto *Phi = dyn_cast<PHINode>(PhiV);
  if (!Phi || Phi->getNumUses() > 2 || Phi->getNumIncomingValues() != 2)
    return false;

  // The recurrence must start at zero (incoming 0) and be fed back by this
  // vp.merge (incoming 1).
  auto *Start = dyn_cast<Constant>(Phi->getIncomingValue(0));
  if (!Start || !Start->isZeroValue() || Phi->getIncomingValue(1) != &II)
    return false;

  // Same element count as the original mask vector, but with i8 elements.
  Type *WideTy =
      VectorType::get(IntegerType::getInt8Ty(II.getContext()),
                      cast<VectorType>(II.getType())->getElementCount());

  // Build the widened phi next to the original one.
  IRBuilder<> Builder(Phi);
  PHINode *WidePhi = Builder.CreatePHI(WideTy, 2);
  WidePhi->addIncoming(ConstantAggregateZero::get(WideTy),
                       Phi->getIncomingBlock(0));

  // Build the widened or + vp.merge right before the original vp.merge.
  Builder.SetInsertPoint(&II);
  Value *WideCmp = Builder.CreateZExt(Cond, WideTy);
  Value *WideOr = Builder.CreateOr(WidePhi, WideCmp);
  Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy},
                                             {Mask, WideOr, WidePhi, EVL});
  WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1));

  // Existing users still expect an i1 vector, so hand them a truncated value.
  Value *Trunc = Builder.CreateTrunc(WideMerge, II.getType());

  II.replaceAllUsesWith(Trunc);

  // Break the cycle and delete the old chain.
  Phi->setIncomingValue(1, Phi->getIncomingValue(0));
  llvm::RecursivelyDeleteTriviallyDeadInstructions(&II);

  return true;
}
183+
106184
// LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector
107185
// reduction instructions write the result in the first element of a vector
108186
// register. So when a reduction in a loop uses a scalar phi, we end up with
@@ -138,6 +216,9 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
138216
if (expandVPStrideLoad(I))
139217
return true;
140218

219+
if (widenVPMerge(I))
220+
return true;
221+
141222
if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
142223
!isa<VPReductionIntrinsic>(&I))
143224
return false;

llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -132,30 +132,25 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
132132
; CHECK-LABEL: widen_anyof_rdx:
133133
; CHECK: # %bb.0: # %entry
134134
; CHECK-NEXT: li a2, 0
135-
; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
136-
; CHECK-NEXT: vmclr.m v12
137-
; CHECK-NEXT: vid.v v8
135+
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
136+
; CHECK-NEXT: vmv.v.i v8, 0
138137
; CHECK-NEXT: .LBB2_1: # %loop
139138
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
140139
; CHECK-NEXT: sub a3, a1, a2
141140
; CHECK-NEXT: slli a4, a2, 2
142-
; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
141+
; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma
143142
; CHECK-NEXT: add a4, a0, a4
144-
; CHECK-NEXT: vle32.v v14, (a4)
145-
; CHECK-NEXT: vsetvli a4, zero, e64, m4, ta, ma
146-
; CHECK-NEXT: vmv.v.x v16, a3
147-
; CHECK-NEXT: vmsleu.vv v13, v16, v8
148-
; CHECK-NEXT: vmsltu.vx v16, v8, a3
149-
; CHECK-NEXT: vmand.mm v13, v12, v13
150-
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
151-
; CHECK-NEXT: vmsne.vi v17, v14, 0
152-
; CHECK-NEXT: vmor.mm v12, v12, v17
153-
; CHECK-NEXT: vmand.mm v12, v12, v16
143+
; CHECK-NEXT: vle32.v v10, (a4)
144+
; CHECK-NEXT: vmsne.vi v0, v10, 0
154145
; CHECK-NEXT: add a2, a2, a3
155-
; CHECK-NEXT: vmor.mm v12, v12, v13
146+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
147+
; CHECK-NEXT: vor.vi v8, v8, 1, v0.t
156148
; CHECK-NEXT: blt a2, a1, .LBB2_1
157149
; CHECK-NEXT: # %bb.2: # %exit
158-
; CHECK-NEXT: vcpop.m a0, v12
150+
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
151+
; CHECK-NEXT: vand.vi v8, v8, 1
152+
; CHECK-NEXT: vmsne.vi v8, v8, 0
153+
; CHECK-NEXT: vcpop.m a0, v8
159154
; CHECK-NEXT: snez a0, a0
160155
; CHECK-NEXT: ret
161156
entry:
@@ -186,31 +181,26 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
186181
; CHECK-LABEL: widen_anyof_rdx_use_in_loop:
187182
; CHECK: # %bb.0: # %entry
188183
; CHECK-NEXT: li a2, 0
189-
; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma
190-
; CHECK-NEXT: vmclr.m v12
191-
; CHECK-NEXT: vid.v v8
184+
; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
185+
; CHECK-NEXT: vmv.v.i v8, 0
192186
; CHECK-NEXT: .LBB3_1: # %loop
193187
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
194188
; CHECK-NEXT: sub a3, a1, a2
195189
; CHECK-NEXT: slli a4, a2, 2
196-
; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma
190+
; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma
197191
; CHECK-NEXT: add a4, a0, a4
198-
; CHECK-NEXT: vle32.v v14, (a4)
199-
; CHECK-NEXT: vsetvli a5, zero, e64, m4, ta, ma
200-
; CHECK-NEXT: vmv.v.x v16, a3
201-
; CHECK-NEXT: vmsleu.vv v13, v16, v8
202-
; CHECK-NEXT: vmsltu.vx v16, v8, a3
203-
; CHECK-NEXT: vmand.mm v13, v12, v13
204-
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
205-
; CHECK-NEXT: vmsne.vi v17, v14, 0
206-
; CHECK-NEXT: vmor.mm v12, v12, v17
207-
; CHECK-NEXT: vmand.mm v12, v12, v16
208-
; CHECK-NEXT: vmor.mm v12, v12, v13
192+
; CHECK-NEXT: vle32.v v10, (a4)
193+
; CHECK-NEXT: vmsne.vi v0, v10, 0
194+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
195+
; CHECK-NEXT: vor.vi v8, v8, 1, v0.t
196+
; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma
197+
; CHECK-NEXT: vand.vi v9, v8, 1
198+
; CHECK-NEXT: vmsne.vi v9, v9, 0
209199
; CHECK-NEXT: add a2, a2, a3
210-
; CHECK-NEXT: vsm.v v12, (a4)
200+
; CHECK-NEXT: vsm.v v9, (a4)
211201
; CHECK-NEXT: blt a2, a1, .LBB3_1
212202
; CHECK-NEXT: # %bb.2: # %exit
213-
; CHECK-NEXT: vcpop.m a0, v12
203+
; CHECK-NEXT: vcpop.m a0, v9
214204
; CHECK-NEXT: snez a0, a0
215205
; CHECK-NEXT: ret
216206
entry:

llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,16 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) {
110110
; CHECK-NEXT: br label [[LOOP:%.*]]
111111
; CHECK: loop:
112112
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
113-
; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ]
113+
; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
114114
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
115115
; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
116116
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
117117
; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
118118
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
119-
; CHECK-NEXT: [[OR:%.*]] = or <vscale x 4 x i1> [[PHI]], [[CMP]]
120-
; CHECK-NEXT: [[TMP4]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[OR]], <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
119+
; CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 4 x i1> [[CMP]] to <vscale x 4 x i8>
120+
; CHECK-NEXT: [[TMP2:%.*]] = or <vscale x 4 x i8> [[TMP0]], [[TMP1]]
121+
; CHECK-NEXT: [[TMP3]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i8> [[TMP2]], <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
122+
; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i8> [[TMP3]] to <vscale x 4 x i1>
121123
; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
122124
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]
123125
; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]]
@@ -156,14 +158,16 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) {
156158
; CHECK-NEXT: br label [[LOOP:%.*]]
157159
; CHECK: loop:
158160
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
159-
; CHECK-NEXT: [[PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[ENTRY]] ], [ [[REC:%.*]], [[LOOP]] ]
161+
; CHECK-NEXT: [[TMP0:%.*]] = phi <vscale x 4 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ]
160162
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]]
161163
; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
162164
; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]]
163165
; CHECK-NEXT: [[X:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
164166
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <vscale x 4 x i32> [[X]], zeroinitializer
165-
; CHECK-NEXT: [[OR:%.*]] = or <vscale x 4 x i1> [[PHI]], [[CMP]]
166-
; CHECK-NEXT: [[REC]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[OR]], <vscale x 4 x i1> [[PHI]], i32 [[EVL]])
167+
; CHECK-NEXT: [[TMP1:%.*]] = zext <vscale x 4 x i1> [[CMP]] to <vscale x 4 x i8>
168+
; CHECK-NEXT: [[TMP2:%.*]] = or <vscale x 4 x i8> [[TMP0]], [[TMP1]]
169+
; CHECK-NEXT: [[TMP3]] = call <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i8> [[TMP2]], <vscale x 4 x i8> [[TMP0]], i32 [[EVL]])
170+
; CHECK-NEXT: [[REC:%.*]] = trunc <vscale x 4 x i8> [[TMP3]] to <vscale x 4 x i1>
167171
; CHECK-NEXT: store <vscale x 4 x i1> [[REC]], ptr [[GEP]], align 1
168172
; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
169173
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]]

0 commit comments

Comments
 (0)