Commit ef520ca

Revert "[RISCV] Recursively split concat_vector into smaller LMULs (#83035)"

This reverts commit c59129a. It caused regressions in some x264 workloads, such as pixel_var_8x8, because it interfered with the strided load combine. Reverting so I can try to rework it as a lowering instead.

Parent: 9a3ece2

9 files changed, 781 insertions(+), 926 deletions(-)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 4 additions & 56 deletions
@@ -15323,62 +15323,13 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
 }
 
-// Recursively split up concat_vectors with more than 2 operands:
-//
-//   concat_vector op1, op2, op3, op4
-// ->
-//   concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
-//
-// This reduces the length of the chain of vslideups and allows us to perform
-// the vslideups at a smaller LMUL, limited to MF2.
-//
-// We do this as a DAG combine rather than during lowering so that any undef
-// operands can get combined away.
-static SDValue
-performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
-                                  const RISCVTargetLowering &TLI) {
-  SDLoc DL(N);
-
-  if (N->getNumOperands() <= 2)
-    return SDValue();
-
-  if (!TLI.isTypeLegal(N->getValueType(0)))
-    return SDValue();
-  MVT VT = N->getSimpleValueType(0);
-
-  // Don't split any further than MF2.
-  MVT ContainerVT = VT;
-  if (VT.isFixedLengthVector())
-    ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget());
-  if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT)))
-    return SDValue();
-
-  MVT HalfVT = VT.getHalfNumVectorElementsVT();
-  assert(isPowerOf2_32(N->getNumOperands()));
-  size_t HalfNumOps = N->getNumOperands() / 2;
-  SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().take_front(HalfNumOps));
-  SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
-                           N->ops().drop_front(HalfNumOps));
-
-  // Lower to an insert_subvector directly so the concat_vectors don't get
-  // recombined.
-  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Lo,
-                            DAG.getVectorIdxConstant(0, DL));
-  Vec = DAG.getNode(
-      ISD::INSERT_SUBVECTOR, DL, VT, Vec, Hi,
-      DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
-  return Vec;
-}
-
 // If we're concatenating a series of vector loads like
 //   concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
 // Then we can turn this into a strided load by widening the vector elements
 //   vlse32 p, stride=n
-static SDValue
-performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
-                                        const RISCVSubtarget &Subtarget,
-                                        const RISCVTargetLowering &TLI) {
+static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget,
+                                            const RISCVTargetLowering &TLI) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
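For context on what was removed: the split combine's effect can be sketched in LLVM IR with vector-insert intrinsics. This is an illustrative reduction written for this summary, not a test from the patch; the function name and the fixed <16 x i8> types are assumed.

; A 4-operand concat is split into two half-width concats, and the result
; is assembled with explicit subvector inserts (mirroring the
; INSERT_SUBVECTOR nodes above) so DAGCombine does not fold the halves
; back into a single concat_vectors.
define <16 x i8> @split_concat_sketch(<4 x i8> %op1, <4 x i8> %op2,
                                      <4 x i8> %op3, <4 x i8> %op4) {
  %lo = shufflevector <4 x i8> %op1, <4 x i8> %op2,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <4 x i8> %op3, <4 x i8> %op4,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %v0 = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> %lo, i64 0)
  %v1 = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> %v0, <8 x i8> %hi, i64 8)
  ret <16 x i8> %v1
}

declare <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8>, <8 x i8>, i64 immarg)

Each half's vslideup chain then runs at half the LMUL of the full result, which is where the intended win came from.
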
@@ -16483,10 +16434,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::CONCAT_VECTORS:
-    if (SDValue V =
-            performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
-      return V;
-    if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
+    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
       return V;
     break;
   case ISD::INSERT_VECTOR_ELT:
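
The strided load combine that motivated this revert matches IR like the following. A minimal sketch under assumed names and types, not a test from this commit: four <4 x i8> loads spaced %n bytes apart, concatenated into one vector. On SelectionDAG this becomes a concat_vectors of loads, which performCONCAT_VECTORSCombine can rewrite as a single strided load (e.g. vlse32 with stride %n).

define <16 x i8> @concat_strided_loads(ptr %p, i64 %n) {
  ; successive base pointers, each %n bytes past the last
  %p1 = getelementptr i8, ptr %p, i64 %n
  %p2 = getelementptr i8, ptr %p1, i64 %n
  %p3 = getelementptr i8, ptr %p2, i64 %n
  %a = load <4 x i8>, ptr %p
  %b = load <4 x i8>, ptr %p1
  %c = load <4 x i8>, ptr %p2
  %d = load <4 x i8>, ptr %p3
  ; concatenate the four loads into a single <16 x i8>
  %ab = shufflevector <4 x i8> %a, <4 x i8> %b,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cd = shufflevector <4 x i8> %c, <4 x i8> %d,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %abcd = shufflevector <8 x i8> %ab, <8 x i8> %cd,
            <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                        i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %abcd
}

Splitting the concat before this combine ran broke the pattern apart, which is the interference the commit message describes.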

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 46 additions & 45 deletions
@@ -161,71 +161,72 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 4
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 8
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 2
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 10
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 4
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 12
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 6
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 8
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsext.vf8 v16, v8
+; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v0, v16, 14
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask

llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT:    th.swia a0, (a1), 4, 0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT:    vle8.v v10, (a3)
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV32-NEXT:    vslideup.vi v10, v9, 4
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT:    th.swia a0, (a1), 4, 0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT:    vle8.v v10, (a3)
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
 ; RV64-NEXT:    vslideup.vi v10, v9, 4
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vzext.vf4 v12, v10

llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll

Lines changed: 2 additions & 1 deletion
@@ -469,8 +469,9 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v13, v10, a0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
