Commit adda09e

[RISCV] Recursively split concat_vector into smaller LMULs
This is the concat_vector equivalent of llvm#81312, in that we recursively split concat_vectors with more than two operands into smaller concat_vectors. This allows us to break up the chain of vslideups, as well as perform the vslideups at a smaller LMUL, which in turn reduces register pressure as the previous lowering performed N vslideups at the highest result LMUL.

This is done as a DAG combine so that any undef operands are combined away: if we do this during lowering then we end up with unnecessary vslideups of undefs.
1 parent 7a0e222 commit adda09e

11 files changed with 985 additions and 936 deletions
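
The split strategy can be pictured with a small standalone C++ sketch (illustrative only; join and concatSplit are hypothetical names, not part of this patch). Each two-way join corresponds to a concat_vectors of half the result type, so the vslideups it eventually lowers to run at a smaller LMUL, and the single serial chain of N-1 slideups at the full result LMUL becomes a balanced tree roughly log2(N) levels deep:

// Standalone illustration of the recursive split; not LLVM code.
#include <cstddef>
#include <vector>

using Vec = std::vector<int>;

// Stand-in for joining two adjacent sub-vectors; in the real lowering this is
// an insert_subvector, i.e. a vslideup at the width of the joined result.
static Vec join(const Vec &Bot, const Vec &Top) {
  Vec R = Bot;
  R.insert(R.end(), Top.begin(), Top.end());
  return R;
}

// Split the operand list in half and recurse, mirroring HalfNumOps in the
// patch: (N + 1) / 2 operands go to the bottom half, the rest to the top.
// Ops is assumed non-empty (the DAG combine only fires for > 2 operands).
static Vec concatSplit(const std::vector<Vec> &Ops) {
  if (Ops.size() == 1)
    return Ops.front();
  std::size_t Half = (Ops.size() + 1) / 2;
  std::vector<Vec> Bot(Ops.begin(), Ops.begin() + Half);
  std::vector<Vec> Top(Ops.begin() + Half, Ops.end());
  return join(concatSplit(Bot), concatSplit(Top));
}

For four operands a, b, c, d this produces join(join(a, b), join(c, d)), which is the shape shown in the comment added to RISCVISelLowering.cpp below.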

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 48 additions & 4 deletions
@@ -15283,13 +15283,54 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
 }
 
+// Recursively split up concat_vectors with more than 2 operands:
+//
+//   concat_vector op1, op2, op3, op4
+// ->
+//   concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
+//
+// This reduces the length of the chain of vslideups and allows us to perform
+// the vslideups at a smaller LMUL.
+//
+// We do this as a DAG combine rather than during lowering so that any undef
+// operands can get combined away.
+static SDValue
+performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
+                                  const RISCVTargetLowering &TLI) {
+  SDLoc DL(N);
+
+  if (N->getNumOperands() <= 2)
+    return SDValue();
+
+  if (!TLI.isTypeLegal(N->getValueType(0)))
+    return SDValue();
+  MVT VT = N->getSimpleValueType(0);
+
+  MVT HalfVT = VT.getHalfNumVectorElementsVT();
+  size_t HalfNumOps = (N->getNumOperands() + 1) / 2;
+  SDValue BotSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                                     N->ops().take_front(HalfNumOps));
+  SDValue TopSubConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT,
+                                     N->ops().drop_front(HalfNumOps));
+
+  // Lower to an insert_subvector directly so the concat_vectors don't get
+  // recombined.
+  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
+                            BotSubConcat, DAG.getVectorIdxConstant(0, DL));
+  Vec = DAG.getNode(
+      ISD::INSERT_SUBVECTOR, DL, VT, Vec, TopSubConcat,
+      DAG.getVectorIdxConstant(HalfVT.getVectorMinNumElements(), DL));
+  return Vec;
+}
+
 // If we're concatenating a series of vector loads like
 // concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
 // Then we can turn this into a strided load by widening the vector elements
 // vlse32 p, stride=n
-static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
-                                            const RISCVSubtarget &Subtarget,
-                                            const RISCVTargetLowering &TLI) {
+static SDValue
+performCONCAT_VECTORSStridedLoadCombine(SDNode *N, SelectionDAG &DAG,
+                                        const RISCVSubtarget &Subtarget,
+                                        const RISCVTargetLowering &TLI) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
@@ -16394,7 +16435,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     break;
   case ISD::CONCAT_VECTORS:
-    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
+    if (SDValue V =
+            performCONCAT_VECTORSStridedLoadCombine(N, DAG, Subtarget, *this))
+      return V;
+    if (SDValue V = performCONCAT_VECTORSSplitCombine(N, DAG, *this))
       return V;
     break;
   case ISD::INSERT_VECTOR_ELT:

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 57 additions & 61 deletions
@@ -123,36 +123,35 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
 define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
+; CHECK-NEXT: vslideup.vi v0, v9, 4
 ; CHECK-NEXT: ret
   %mask = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64 %index, i64 %tc)
   ret <64 x i1> %mask
@@ -161,72 +160,69 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-LABEL: fv128:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 2
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 4
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 6
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v10, (a0)
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 8
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v10
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v10, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v9, 4
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
 ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 10
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a0)
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 12
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsext.vf8 v16, v9
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v9, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vid.v v16
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v0, v16, a2
 ; CHECK-NEXT: vsext.vf8 v16, v8
-; CHECK-NEXT: vsaddu.vx v8, v16, a1
-; CHECK-NEXT: vmsltu.vx v16, v8, a2
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v0, v16, 14
+; CHECK-NEXT: vsaddu.vx v16, v16, a1
+; CHECK-NEXT: vmsltu.vx v8, v16, a2
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v8, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v9, 4
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v0, v10, 8
 ; CHECK-NEXT: ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask

llvm/test/CodeGen/RISCV/rvv/combine-store-extract-crash.ll

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV32-NEXT: th.swia a0, (a1), 4, 0
 ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
 ; RV32-NEXT: vle8.v v10, (a3)
-; RV32-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT: vslideup.vi v10, v9, 4
 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT: vzext.vf4 v12, v10
@@ -42,7 +42,7 @@ define void @test(ptr %ref_array, ptr %sad_array) {
 ; RV64-NEXT: th.swia a0, (a1), 4, 0
 ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
 ; RV64-NEXT: vle8.v v10, (a3)
-; RV64-NEXT: vsetivli zero, 8, e8, m1, tu, ma
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT: vslideup.vi v10, v9, 4
 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT: vzext.vf4 v12, v10

llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll

Lines changed: 1 addition & 2 deletions
@@ -469,9 +469,8 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v13, v10, a0
 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v13, v10, a0
 ; CHECK-NEXT: vslidedown.vx v12, v9, a0
 ; CHECK-NEXT: add a1, a0, a0
 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll

Lines changed: 20 additions & 23 deletions
@@ -7,14 +7,14 @@
 define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-LABEL: load_large_vector:
 ; ZVE32X: # %bb.0:
-; ZVE32X-NEXT: ld a1, 80(a0)
-; ZVE32X-NEXT: ld a2, 72(a0)
-; ZVE32X-NEXT: ld a3, 56(a0)
-; ZVE32X-NEXT: ld a4, 32(a0)
-; ZVE32X-NEXT: ld a5, 24(a0)
-; ZVE32X-NEXT: ld a6, 48(a0)
-; ZVE32X-NEXT: ld a7, 8(a0)
-; ZVE32X-NEXT: ld a0, 0(a0)
+; ZVE32X-NEXT: ld a1, 8(a0)
+; ZVE32X-NEXT: ld a2, 0(a0)
+; ZVE32X-NEXT: ld a3, 32(a0)
+; ZVE32X-NEXT: ld a4, 80(a0)
+; ZVE32X-NEXT: ld a5, 72(a0)
+; ZVE32X-NEXT: ld a6, 24(a0)
+; ZVE32X-NEXT: ld a7, 56(a0)
+; ZVE32X-NEXT: ld a0, 48(a0)
 ; ZVE32X-NEXT: xor a4, a5, a4
 ; ZVE32X-NEXT: snez a4, a4
 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -28,12 +28,10 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT: vmv.s.x v10, a0
 ; ZVE32X-NEXT: vand.vi v10, v10, 1
 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
 ; ZVE32X-NEXT: vmv.v.i v10, 0
 ; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
 ; ZVE32X-NEXT: vslideup.vi v11, v9, 1
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT: vmsne.vi v0, v11, 0
 ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
 ; ZVE32X-NEXT: xor a0, a6, a3
@@ -42,22 +40,21 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT: vand.vi v11, v11, 1
 ; ZVE32X-NEXT: vmsne.vi v0, v11, 0
-; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0
-; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
-; ZVE32X-NEXT: vslideup.vi v9, v11, 2
-; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
-; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
 ; ZVE32X-NEXT: xor a1, a2, a1
 ; ZVE32X-NEXT: snez a0, a1
-; ZVE32X-NEXT: vmv.s.x v10, a0
-; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT: vand.vi v10, v10, 1
+; ZVE32X-NEXT: vmv.s.x v11, a0
+; ZVE32X-NEXT: vand.vi v11, v11, 1
+; ZVE32X-NEXT: vmsne.vi v0, v11, 0
+; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0
+; ZVE32X-NEXT: vslideup.vi v10, v8, 1
 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0
-; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT: vslideup.vi v9, v8, 3
-; ZVE32X-NEXT: vmsne.vi v0, v9, 0
+; ZVE32X-NEXT: vmv.v.i v8, 0
+; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
+; ZVE32X-NEXT: vslideup.vi v8, v9, 2
+; ZVE32X-NEXT: vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT: ret
 ;
 ; ZVE64X-LABEL: load_large_vector:
