Skip to content

Commit 0f44137

Browse files
committed
Limit splitting to MF2
1 parent 9284955 commit 0f44137

File tree

5 files changed

+254
-159
lines changed

5 files changed

+254
-159
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15288,7 +15288,7 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
1528815288
// concat_vector (concat_vector op1, op2), (concat_vector op3, op4)
1528915289
//
1529015290
// This reduces the length of the chain of vslideups and allows us to perform
15291-
// the vslideups at a smaller LMUL.
15291+
// the vslideups at a smaller LMUL, though never smaller than MF2.
1529215292
//
1529315293
// We do this as a DAG combine rather than during lowering so that any undef
1529415294
// operands can get combined away.
@@ -15304,6 +15304,13 @@ performCONCAT_VECTORSSplitCombine(SDNode *N, SelectionDAG &DAG,
1530415304
return SDValue();
1530515305
MVT VT = N->getSimpleValueType(0);
1530615306

15307+
// Don't split any further than MF2, i.e. stop once the container type is already smaller than LMUL1.
15308+
MVT ContainerVT = VT;
15309+
if (VT.isFixedLengthVector())
15310+
ContainerVT = getContainerForFixedLengthVector(DAG, VT, TLI.getSubtarget());
15311+
if (ContainerVT.bitsLT(getLMUL1VT(ContainerVT)))
15312+
return SDValue();
15313+
1530715314
MVT HalfVT = VT.getHalfNumVectorElementsVT();
1530815315
assert(isPowerOf2_32(N->getNumOperands()));
1530915316
size_t HalfNumOps = N->getNumOperands() / 2;

llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll

Lines changed: 53 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -123,35 +123,36 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
123123
define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
124124
; CHECK-LABEL: fv64:
125125
; CHECK: # %bb.0:
126+
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
126127
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
127128
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
128-
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
129129
; CHECK-NEXT: vle8.v v8, (a0)
130+
; CHECK-NEXT: vid.v v16
131+
; CHECK-NEXT: vsaddu.vx v16, v16, a1
132+
; CHECK-NEXT: vmsltu.vx v0, v16, a2
133+
; CHECK-NEXT: vsext.vf8 v16, v8
134+
; CHECK-NEXT: vsaddu.vx v8, v16, a1
135+
; CHECK-NEXT: vmsltu.vx v16, v8, a2
136+
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
137+
; CHECK-NEXT: vslideup.vi v0, v16, 2
130138
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
131139
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_1)
132-
; CHECK-NEXT: vle8.v v9, (a0)
133-
; CHECK-NEXT: vsext.vf8 v16, v8
134-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
135-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
136-
; CHECK-NEXT: vsext.vf8 v16, v9
137-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
138-
; CHECK-NEXT: vmsltu.vx v9, v16, a2
139-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
140-
; CHECK-NEXT: vslideup.vi v9, v8, 2
141140
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
141+
; CHECK-NEXT: vle8.v v8, (a0)
142+
; CHECK-NEXT: vsext.vf8 v16, v8
143+
; CHECK-NEXT: vsaddu.vx v8, v16, a1
144+
; CHECK-NEXT: vmsltu.vx v16, v8, a2
145+
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
146+
; CHECK-NEXT: vslideup.vi v0, v16, 4
142147
; CHECK-NEXT: lui a0, %hi(.LCPI9_2)
143148
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_2)
149+
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
144150
; CHECK-NEXT: vle8.v v8, (a0)
145-
; CHECK-NEXT: vid.v v16
146-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
147-
; CHECK-NEXT: vmsltu.vx v0, v16, a2
148151
; CHECK-NEXT: vsext.vf8 v16, v8
149-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
150-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
151-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
152-
; CHECK-NEXT: vslideup.vi v0, v8, 2
152+
; CHECK-NEXT: vsaddu.vx v8, v16, a1
153+
; CHECK-NEXT: vmsltu.vx v16, v8, a2
153154
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
154-
; CHECK-NEXT: vslideup.vi v0, v9, 4
155+
; CHECK-NEXT: vslideup.vi v0, v16, 6
155156
; CHECK-NEXT: ret
156157
%mask = call <64 x i1> @llvm.get.active.lane.mask.v64i1.i64(i64 %index, i64 %tc)
157158
ret <64 x i1> %mask
@@ -169,60 +170,62 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
169170
; CHECK-NEXT: vle8.v v9, (a0)
170171
; CHECK-NEXT: vsext.vf8 v16, v8
171172
; CHECK-NEXT: vsaddu.vx v16, v16, a1
172-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
173+
; CHECK-NEXT: vmsltu.vx v10, v16, a2
173174
; CHECK-NEXT: vsext.vf8 v16, v9
174175
; CHECK-NEXT: vsaddu.vx v16, v16, a1
175-
; CHECK-NEXT: vmsltu.vx v9, v16, a2
176-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
177-
; CHECK-NEXT: vslideup.vi v9, v8, 2
176+
; CHECK-NEXT: vmsltu.vx v8, v16, a2
177+
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
178+
; CHECK-NEXT: vslideup.vi v8, v10, 2
178179
; CHECK-NEXT: lui a0, %hi(.LCPI10_2)
179180
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_2)
180181
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
181-
; CHECK-NEXT: vle8.v v8, (a0)
182+
; CHECK-NEXT: vle8.v v9, (a0)
183+
; CHECK-NEXT: vsext.vf8 v16, v9
184+
; CHECK-NEXT: vsaddu.vx v16, v16, a1
185+
; CHECK-NEXT: vmsltu.vx v9, v16, a2
186+
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
187+
; CHECK-NEXT: vslideup.vi v8, v9, 4
182188
; CHECK-NEXT: lui a0, %hi(.LCPI10_3)
183189
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_3)
184-
; CHECK-NEXT: vle8.v v10, (a0)
185-
; CHECK-NEXT: vsext.vf8 v16, v8
186-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
187-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
188-
; CHECK-NEXT: vsext.vf8 v16, v10
190+
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
191+
; CHECK-NEXT: vle8.v v9, (a0)
192+
; CHECK-NEXT: vsext.vf8 v16, v9
189193
; CHECK-NEXT: vsaddu.vx v16, v16, a1
190-
; CHECK-NEXT: vmsltu.vx v10, v16, a2
191-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
192-
; CHECK-NEXT: vslideup.vi v10, v8, 2
194+
; CHECK-NEXT: vmsltu.vx v9, v16, a2
193195
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
194-
; CHECK-NEXT: vslideup.vi v10, v9, 4
196+
; CHECK-NEXT: vslideup.vi v8, v9, 6
197+
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
195198
; CHECK-NEXT: lui a0, %hi(.LCPI10_4)
196199
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_4)
197-
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
198-
; CHECK-NEXT: vle8.v v8, (a0)
199-
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
200-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
201200
; CHECK-NEXT: vle8.v v9, (a0)
202-
; CHECK-NEXT: vsext.vf8 v16, v8
201+
; CHECK-NEXT: vid.v v16
203202
; CHECK-NEXT: vsaddu.vx v16, v16, a1
204-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
203+
; CHECK-NEXT: vmsltu.vx v0, v16, a2
205204
; CHECK-NEXT: vsext.vf8 v16, v9
206205
; CHECK-NEXT: vsaddu.vx v16, v16, a1
207206
; CHECK-NEXT: vmsltu.vx v9, v16, a2
208-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
209-
; CHECK-NEXT: vslideup.vi v9, v8, 2
207+
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
208+
; CHECK-NEXT: vslideup.vi v0, v9, 2
209+
; CHECK-NEXT: lui a0, %hi(.LCPI10_5)
210+
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_5)
210211
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
212+
; CHECK-NEXT: vle8.v v9, (a0)
213+
; CHECK-NEXT: vsext.vf8 v16, v9
214+
; CHECK-NEXT: vsaddu.vx v16, v16, a1
215+
; CHECK-NEXT: vmsltu.vx v9, v16, a2
216+
; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
217+
; CHECK-NEXT: vslideup.vi v0, v9, 4
211218
; CHECK-NEXT: lui a0, %hi(.LCPI10_6)
212219
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_6)
213-
; CHECK-NEXT: vle8.v v8, (a0)
214-
; CHECK-NEXT: vid.v v16
215-
; CHECK-NEXT: vsaddu.vx v16, v16, a1
216-
; CHECK-NEXT: vmsltu.vx v0, v16, a2
217-
; CHECK-NEXT: vsext.vf8 v16, v8
220+
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
221+
; CHECK-NEXT: vle8.v v9, (a0)
222+
; CHECK-NEXT: vsext.vf8 v16, v9
218223
; CHECK-NEXT: vsaddu.vx v16, v16, a1
219-
; CHECK-NEXT: vmsltu.vx v8, v16, a2
220-
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
221-
; CHECK-NEXT: vslideup.vi v0, v8, 2
224+
; CHECK-NEXT: vmsltu.vx v9, v16, a2
222225
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
223-
; CHECK-NEXT: vslideup.vi v0, v9, 4
226+
; CHECK-NEXT: vslideup.vi v0, v9, 6
224227
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
225-
; CHECK-NEXT: vslideup.vi v0, v10, 8
228+
; CHECK-NEXT: vslideup.vi v0, v8, 8
226229
; CHECK-NEXT: ret
227230
%mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
228231
ret <128 x i1> %mask

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
define <4 x i1> @load_large_vector(ptr %p) {
88
; ZVE32X-LABEL: load_large_vector:
99
; ZVE32X: # %bb.0:
10-
; ZVE32X-NEXT: ld a1, 8(a0)
11-
; ZVE32X-NEXT: ld a2, 0(a0)
12-
; ZVE32X-NEXT: ld a3, 32(a0)
13-
; ZVE32X-NEXT: ld a4, 80(a0)
14-
; ZVE32X-NEXT: ld a5, 72(a0)
15-
; ZVE32X-NEXT: ld a6, 24(a0)
16-
; ZVE32X-NEXT: ld a7, 56(a0)
17-
; ZVE32X-NEXT: ld a0, 48(a0)
10+
; ZVE32X-NEXT: ld a1, 80(a0)
11+
; ZVE32X-NEXT: ld a2, 72(a0)
12+
; ZVE32X-NEXT: ld a3, 56(a0)
13+
; ZVE32X-NEXT: ld a4, 32(a0)
14+
; ZVE32X-NEXT: ld a5, 24(a0)
15+
; ZVE32X-NEXT: ld a6, 48(a0)
16+
; ZVE32X-NEXT: ld a7, 8(a0)
17+
; ZVE32X-NEXT: ld a0, 0(a0)
1818
; ZVE32X-NEXT: xor a4, a5, a4
1919
; ZVE32X-NEXT: snez a4, a4
2020
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -28,10 +28,12 @@ define <4 x i1> @load_large_vector(ptr %p) {
2828
; ZVE32X-NEXT: vmv.s.x v10, a0
2929
; ZVE32X-NEXT: vand.vi v10, v10, 1
3030
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
31-
; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
31+
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
3232
; ZVE32X-NEXT: vmv.v.i v10, 0
3333
; ZVE32X-NEXT: vmerge.vim v11, v10, 1, v0
34+
; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
3435
; ZVE32X-NEXT: vslideup.vi v11, v9, 1
36+
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
3537
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
3638
; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
3739
; ZVE32X-NEXT: xor a0, a6, a3
@@ -40,21 +42,22 @@ define <4 x i1> @load_large_vector(ptr %p) {
4042
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
4143
; ZVE32X-NEXT: vand.vi v11, v11, 1
4244
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
43-
; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
45+
; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0
46+
; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
47+
; ZVE32X-NEXT: vslideup.vi v9, v11, 2
48+
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
49+
; ZVE32X-NEXT: vmsne.vi v0, v9, 0
50+
; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0
4451
; ZVE32X-NEXT: xor a1, a2, a1
4552
; ZVE32X-NEXT: snez a0, a1
46-
; ZVE32X-NEXT: vmv.s.x v11, a0
47-
; ZVE32X-NEXT: vand.vi v11, v11, 1
48-
; ZVE32X-NEXT: vmsne.vi v0, v11, 0
49-
; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
50-
; ZVE32X-NEXT: vmerge.vim v10, v10, 1, v0
51-
; ZVE32X-NEXT: vslideup.vi v10, v8, 1
53+
; ZVE32X-NEXT: vmv.s.x v10, a0
54+
; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
55+
; ZVE32X-NEXT: vand.vi v10, v10, 1
5256
; ZVE32X-NEXT: vmsne.vi v0, v10, 0
53-
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
54-
; ZVE32X-NEXT: vmv.v.i v8, 0
5557
; ZVE32X-NEXT: vmerge.vim v8, v8, 1, v0
56-
; ZVE32X-NEXT: vslideup.vi v8, v9, 2
57-
; ZVE32X-NEXT: vmsne.vi v0, v8, 0
58+
; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
59+
; ZVE32X-NEXT: vslideup.vi v9, v8, 3
60+
; ZVE32X-NEXT: vmsne.vi v0, v9, 0
5861
; ZVE32X-NEXT: ret
5962
;
6063
; ZVE64X-LABEL: load_large_vector:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,59 @@
55
; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
66
; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
77

8+
define <8 x i16> @concat_2xv4i16(<4 x i16> %a, <4 x i16> %b) {
9+
; CHECK-LABEL: concat_2xv4i16:
10+
; CHECK: # %bb.0:
11+
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
12+
; CHECK-NEXT: vslideup.vi v8, v9, 4
13+
; CHECK-NEXT: ret
14+
%ab = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
15+
ret <8 x i16> %ab
16+
}
17+
18+
define <8 x i16> @concat_4xv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
19+
; CHECK-LABEL: concat_4xv2i16:
20+
; CHECK: # %bb.0:
21+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
22+
; CHECK-NEXT: vslideup.vi v10, v11, 2
23+
; CHECK-NEXT: vslideup.vi v8, v9, 2
24+
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
25+
; CHECK-NEXT: vslideup.vi v8, v10, 4
26+
; CHECK-NEXT: ret
27+
%ab = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
28+
%cd = shufflevector <2 x i16> %c, <2 x i16> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
29+
%abcd = shufflevector <4 x i16> %ab, <4 x i16> %cd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
30+
ret <8 x i16> %abcd
31+
}
32+
33+
define <8 x i16> @concat_8xv1i16(<1 x i16> %a, <1 x i16> %b, <1 x i16> %c, <1 x i16> %d, <1 x i16> %e, <1 x i16> %f, <1 x i16> %g, <1 x i16> %h) {
34+
; CHECK-LABEL: concat_8xv1i16:
35+
; CHECK: # %bb.0:
36+
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
37+
; CHECK-NEXT: vslideup.vi v12, v13, 1
38+
; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
39+
; CHECK-NEXT: vslideup.vi v12, v14, 2
40+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
41+
; CHECK-NEXT: vslideup.vi v12, v15, 3
42+
; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
43+
; CHECK-NEXT: vslideup.vi v8, v9, 1
44+
; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
45+
; CHECK-NEXT: vslideup.vi v8, v10, 2
46+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
47+
; CHECK-NEXT: vslideup.vi v8, v11, 3
48+
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
49+
; CHECK-NEXT: vslideup.vi v8, v12, 4
50+
; CHECK-NEXT: ret
51+
%ab = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
52+
%cd = shufflevector <1 x i16> %c, <1 x i16> %d, <2 x i32> <i32 0, i32 1>
53+
%abcd = shufflevector <2 x i16> %ab, <2 x i16> %cd, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
54+
%ef = shufflevector <1 x i16> %e, <1 x i16> %f, <2 x i32> <i32 0, i32 1>
55+
%gh = shufflevector <1 x i16> %g, <1 x i16> %h, <2 x i32> <i32 0, i32 1>
56+
%efgh = shufflevector <2 x i16> %ef, <2 x i16> %gh, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
57+
%abcdefgh = shufflevector <4 x i16> %abcd, <4 x i16> %efgh, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
58+
ret <8 x i16> %abcdefgh
59+
}
60+
861
define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
962
; CHECK-LABEL: concat_2xv4i32:
1063
; CHECK: # %bb.0:

0 commit comments

Comments
 (0)