
Commit c59129a

[RISCV] Recursively split concat_vector into smaller LMULs (#83035)
This is the concat_vector equivalent of #81312, in that we recursively split concat_vectors with more than two operands into smaller concat_vectors.

This allows us to break up the chain of vslideups and perform the vslideups at a smaller LMUL, which in turn reduces register pressure, since the previous lowering performed N vslideups at the highest result LMUL. For now, splitting stops at MF2.

This is done as a DAG combine so that any undef operands are combined away: if we did this during lowering, we would end up with unnecessary vslideups of undefs.
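As a rough standalone sketch of the splitting strategy described above (a toy model over plain arrays, not the SelectionDAG code; the names Vec, insertSubvector and concatRecursive are illustrative and do not exist in LLVM):

#include <cassert>
#include <cstddef>
#include <vector>

using Vec = std::vector<int>;

// Copy Sub into Dst starting at element Idx, mirroring what an
// insert_subvector node does on the lowered result.
static void insertSubvector(Vec &Dst, const Vec &Sub, std::size_t Idx) {
  assert(Idx + Sub.size() <= Dst.size());
  for (std::size_t I = 0; I < Sub.size(); ++I)
    Dst[Idx + I] = Sub[I];
}

// Concatenate Ops by recursively splitting the operand list in half,
// building each half independently, and stitching the halves together with
// two insert_subvector-style copies.  This mirrors
//   concat_vector op1, op2, op3, op4
//   -> concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
// from the description above; the real combine additionally stops once a
// piece would drop below the MF2 register-group size.
static Vec concatRecursive(const std::vector<Vec> &Ops) {
  assert(!Ops.empty() && (Ops.size() & (Ops.size() - 1)) == 0 &&
         "power-of-two operand count, as the DAG combine asserts");
  if (Ops.size() == 1)
    return Ops.front();
  std::size_t HalfNumOps = Ops.size() / 2;
  Vec Lo = concatRecursive(
      std::vector<Vec>(Ops.begin(), Ops.begin() + HalfNumOps));
  Vec Hi = concatRecursive(
      std::vector<Vec>(Ops.begin() + HalfNumOps, Ops.end()));
  Vec Result(Lo.size() + Hi.size());
  insertSubvector(Result, Lo, 0);
  insertSubvector(Result, Hi, Lo.size()); // offset = half the result length
  return Result;
}

int main() {
  // Four 4-element pieces: the old lowering chained one vslideup per piece at
  // the full 16-element width; the split form only stitches two 8-element
  // halves at full width and builds each half at half the width.
  std::vector<Vec> Ops = {{0, 1, 2, 3}, {4, 5, 6, 7},
                          {8, 9, 10, 11}, {12, 13, 14, 15}};
  Vec Out = concatRecursive(Ops);
  for (std::size_t I = 0; I < Out.size(); ++I)
    assert(Out[I] == static_cast<int>(I));
  return 0;
}

For four operands this produces exactly the nested form quoted in the comment of the new combine: two half-width concats stitched together by two insert_subvector-style copies.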
1 parent 99500e8 commit c59129a

9 files changed, 926 insertions(+), 781 deletions(-)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 56 additions & 4 deletions
@@ -15283,13 +15283,62 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
 }
 
+// Recursively split up concat_vectors with more than 2 operands:
+//
+//   concat_vector op1, op2, op3, op4
+// ->
+//   concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+//
+// This reduces the length of the chain of vslideups and allows us to perform
+// the vslideups at a smaller LMUL, limited to MF2.
+//
+// We do this as a DAG combine rather than during lowering so that any undef
+// operands can get combined away.
+static SDValue
+performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
+                                  const RISCVTargetLowering &TLI) {
+  SDLoc DL(N);
+
+  if (N->getNumOperands() <= 2)
+    return SDValue();
+
+  if (!TLI.isTypeLegal(N->getValueType(0)))
+    return SDValue();
+  MVT VT = N->getSimpleValueType(0);
+
+  // Don't split any further than MF2.
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector())
+    ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget());
+  if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT)))
+    return SDValue();
+
+  MVT HalfVT = VT.getHalfNumVectorElementsVT();
+  assert(isPowerOf2_32(N->getNumOperands()));
+  size_t HalfNumOps = N->getNumOperands() / 2;
+  SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                           N->ops().take_front(HalfNumOps));
+  SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                           N->ops().drop_front(HalfNumOps));
+
+  // Lower to an insert_subvector directly so the concat_vectors don't get
+  // recombined.
+  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lo,
+                            DAG.getVectorIdxConstant(0, DL));
+  Vec = DAG.getNode(
+      ISD::INSERT_SUBVECTOR, DL, VT, Vec, Hi,
+      DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
+  return Vec;
+}
+
 // If we're concatenating a series of vector loads like
 //   concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
 // Then we can turn this into a strided load by widening the vector elements
 // vlse32 p, stride=n
-static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
-                                            const RISCVSubtarget &Subtarget,
-                                            const RISCVTargetLowering &TLI) {
+static SDValue
+performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
+                                        const RISCVSubtarget &Subtarget,
+                                        const RISCVTargetLowering &TLI) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
@@ -16394,7 +16443,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::CONCAT_VECTORS:
-    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
+    if (SDValue V =
+            performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
+      return V;
+    if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
       return V;
     break;
   case ISD::INSERT_VECTOR_ELT:
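To make the register-pressure argument in the comment above concrete, here is a hedged counting model (not taken from the patch; Chained, Split and Count are illustrative names, and it assumes one slide per non-leading piece at each node's width, which glosses over the exact vsetvli/LMUL details visible in the tests below):

#include <cstdio>
#include <map>

int main() {
  const unsigned NumOps = 8, PieceElems = 4; // hypothetical 8-way concat

  // Old lowering: every piece after the first is slid into place at the
  // full result width.
  std::map<unsigned, unsigned> Chained;
  Chained[NumOps * PieceElems] = NumOps - 1;

  // Split lowering: each node is built from two halves with one slide at
  // that node's width; both halves are built the same way recursively.
  std::map<unsigned, unsigned> Split;
  auto Count = [&](auto &&Self, unsigned Ops) -> void {
    if (Ops < 2)
      return;
    Split[Ops * PieceElems] += 1; // one slide to place the high half
    Self(Self, Ops / 2);          // low half
    Self(Self, Ops / 2);          // high half
  };
  Count(Count, NumOps);

  std::printf("width(elems)  chained  split\n");
  for (unsigned W = 2 * PieceElems; W <= NumOps * PieceElems; W *= 2)
    std::printf("%12u  %7u  %5u\n", W,
                Chained.count(W) ? Chained[W] : 0u,
                Split.count(W) ? Split[W] : 0u);
  // Prints 1 slide at the widest width (vs. 7), 2 at half width and 4 at a
  // quarter width: the same total work, but most of it at small LMUL.
  return 0;
}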

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 45 additions & 46 deletions
@@ -161,72 +161,71 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 4
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v8, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 6
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 8
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 10
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 12
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
-; CHECK-NEXT:    vmsltu.vx v16, v8, a2
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v16, 14
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v9
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v9, v16, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v9, 6
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v8, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask

llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10

llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll

Lines changed: 1 addition & 2 deletions
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v13, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
