
Commit 7329ba1

lukel97 authored and chencha3 committed
[RISCV] Recursively split concat_vector into smaller LMULs when lowering (llvm#85825)
This is a reimplementation of the combine added in llvm#83035, but as a lowering instead of a combine, so we don't regress the test case added in e59f120 by interfering with the strided load combine.

Previously the combine had to concatenate the split vectors with insert_subvector instead of concat_vectors to prevent an infinite combine loop, and the reasoning behind keeping it as a combine was that if we emitted the insert_subvector during lowering, we didn't fold away inserts of undef subvectors. However, it turns out we can avoid this if we do the split in lowering and select a concat_vector directly, since we get the undef folding for free with `DAG.getNode(ISD::CONCAT_VECTORS, ...)` via foldCONCAT_VECTORS.
1 parent 8e1dc05 commit 7329ba1

9 files changed: +844, -753 lines
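Before the diffs, a minimal standalone sketch of the recursive halving the commit describes may help. This is plain C++ over a toy `Vec` type, not LLVM's SelectionDAG API; `Vec`, `concatPair`, and `concatRecursive` are hypothetical names used only for illustration, with `concatPair` standing in for the existing two-operand lowering path.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for a vector value; in the real lowering these are
// the SDValue operands of a CONCAT_VECTORS node.
using Vec = std::vector<int>;

// Assumed pairwise concatenation primitive; in the lowering this role is
// played by the existing two-operand vslideup-based path.
static Vec concatPair(const Vec &Lo, const Vec &Hi) {
  Vec Out = Lo;
  Out.insert(Out.end(), Hi.begin(), Hi.end());
  return Out;
}

// Recursively split a concat of N operands (N a power of two) into a binary
// tree of two-operand concats:
//   concat(op1, op2, op3, op4) -> concat(concat(op1, op2), concat(op3, op4))
static Vec concatRecursive(const std::vector<Vec> &Ops) {
  assert(!Ops.empty() && (Ops.size() & (Ops.size() - 1)) == 0 &&
         "expected a power-of-two number of operands");
  if (Ops.size() == 1)
    return Ops.front();
  std::size_t Half = Ops.size() / 2;
  std::vector<Vec> LoOps(Ops.begin(), Ops.begin() + Half);
  std::vector<Vec> HiOps(Ops.begin() + Half, Ops.end());
  return concatPair(concatRecursive(LoOps), concatRecursive(HiOps));
}
```

For four operands this yields concat(concat(op1, op2), concat(op3, op4)): each inner concat works on half as many elements, which is what lets the real lowering run its vslideups at a smaller LMUL before the final full-width step.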

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 24 additions & 0 deletions
@@ -6611,6 +6611,30 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // better than going through the stack, as the default expansion does.
     SDLoc DL(Op);
     MVT VT = Op.getSimpleValueType();
+    MVT ContainerVT = VT;
+    if (VT.isFixedLengthVector())
+      ContainerVT = ::getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+    // Recursively split concat_vectors with more than 2 operands:
+    //
+    //   concat_vector op1, op2, op3, op4
+    // ->
+    //   concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+    //
+    // This reduces the length of the chain of vslideups and allows us to
+    // perform the vslideups at a smaller LMUL, limited to MF2.
+    if (Op.getNumOperands() > 2 &&
+        ContainerVT.bitsGE(getLMUL1VT(ContainerVT))) {
+      MVT HalfVT = VT.getHalfNumVectorElementsVT();
+      assert(isPowerOf2_32(Op.getNumOperands()));
+      size_t HalfNumOps = Op.getNumOperands() / 2;
+      SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                               Op->ops().take_front(HalfNumOps));
+      SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                               Op->ops().drop_front(HalfNumOps));
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+    }
+
     unsigned NumOpElts =
         Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
     SDValue Vec = DAG.getUNDEF(VT);
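The hunk above builds the two halves and the final result with `DAG.getNode(ISD::CONCAT_VECTORS, ...)` rather than a chain of insert_subvector nodes, and the commit message credits foldCONCAT_VECTORS with folding away undef subvectors for free as a result. The toy model below only sketches that argument; `Node` and `buildConcat` are hypothetical names and this is not LLVM's foldCONCAT_VECTORS.

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical stand-in for a DAG value: either undef or a concat of
// operands. This is not LLVM's SDNode; it only models the folding argument.
struct Node {
  bool IsUndef = false;
  std::vector<Node> Ops;
};

// Building the concatenation as a single node lets the builder see every
// operand at once, so a concat whose operands are all undef collapses to
// undef immediately. A chain of insert_subvector(undef, ...) spreads the same
// information across several nodes, which is why the earlier combine-based
// version had to clean up undef subvector inserts separately.
static Node buildConcat(std::vector<Node> Ops) {
  bool AllUndef = std::all_of(Ops.begin(), Ops.end(),
                              [](const Node &N) { return N.IsUndef; });
  if (AllUndef)
    return Node{/*IsUndef=*/true, {}};
  return Node{/*IsUndef=*/false, std::move(Ops)};
}
```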

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 45 additions & 46 deletions
@@ -161,72 +161,71 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 4
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 6
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 8
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 10
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 12
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 14
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v8, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask

llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10

llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll

Lines changed: 1 addition & 2 deletions
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
