
Commit c663401

[RISCV] Prefer vrgatherei16 for shuffles (#66291)
If the data type is larger than e16, and the index vector requires more than an LMUL1 register class, prefer the use of vrgatherei16. This has three major benefits:
1) Less work needed to evaluate the constant for e.g. vid sequences. Remember that arithmetic generally scales linearly with LMUL.
2) Less register pressure. In particular, the source and index registers *can* overlap, so using a smaller index can significantly help at m8.
3) Smaller constants. We've got a bunch of tricks for materializing small constants, and if needed, can use an EEW=16 load.
1 parent ff2622b commit c663401

11 files changed (+835, -1457 lines)
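
As a concrete illustration (a sketch, not part of the commit itself): the vrgather_shuffle_vv_v4f64 test updated below contains a <4 x double> shuffle that previously lowered on RV64 to vrgather.vv with an e64 index vector (vle64.v); it now lowers to vrgatherei16.vv with an e16 index vector (vle16.v). The IR is reconstructed from the test diff; the llc invocation implied here (e.g. llc -mtriple=riscv64 -mattr=+v) is an assumption, not quoted from the test's RUN line.

; Reconstructed from llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
; (see the diff below). Assuming VLEN=128, a <4 x i64> index vector would
; occupy an m2 register group (256 bits); the <4 x i16> indices used after
; this change need only 64 bits and a smaller constant pool entry.
define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) {
  %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
  ret <4 x double> %s
}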

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 9 additions & 0 deletions
@@ -4663,6 +4663,15 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     IndexVT = IndexVT.changeVectorElementType(MVT::i16);
   }
 
+  // If the mask allows, we can do all the index computation in 16 bits.  This
+  // requires less work and less register pressure at high LMUL, and creates
+  // smaller constants which may be cheaper to materialize.
+  if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) &&
+      (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) {
+    GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
+    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+  }
+
   MVT IndexContainerVT =
       ContainerVT.changeVectorElementType(IndexVT.getScalarType());
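
To make the new guard concrete, a sketch under the assumption that Subtarget.getRealMinVLen() is 128 (the minimum the V extension allows): for a <4 x double> shuffle the natural index type is <4 x i64>, which is 256 bits, so IndexVT.getSizeInBits() / getRealMinVLen() is 2 and the check above fires, shrinking the indices to e16. A shuffle whose natural index vector already fits in a single LMUL1 register, such as the hypothetical <4 x float> case below (not taken from this commit's tests), is left on the existing path:

; Hypothetical example, assuming VLEN=128: the natural <4 x i32> index
; vector is only 128 bits (LMUL1), so the ratio is 1, the guard above does
; not fire, and the index element type stays at e32.
define <4 x float> @small_shuffle_example(<4 x float> %x) {
  %s = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
  ret <4 x float> %s
}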

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 6 additions & 5 deletions
@@ -33,16 +33,17 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) {
 define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
 ; CHECK-LABEL: hang_when_merging_stores_after_legalization:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT: vid.v v12
 ; CHECK-NEXT: li a0, 7
 ; CHECK-NEXT: vmul.vx v14, v12, a0
-; CHECK-NEXT: vrgather.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT: vadd.vi v8, v14, -14
-; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu
-; CHECK-NEXT: vrgather.vv v12, v10, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
 ; CHECK-NEXT: vmv1r.v v8, v12
 ; CHECK-NEXT: ret
   %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll

Lines changed: 47 additions & 103 deletions
@@ -36,34 +36,20 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) {
 ; One vXf64 test case to very that we don't optimize it.
 ; FIXME: Is there better codegen we can do here?
 define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
-; RV32-V128-LABEL: interleave_v2f64:
-; RV32-V128: # %bb.0:
-; RV32-V128-NEXT: vmv1r.v v12, v9
-; RV32-V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-V128-NEXT: vid.v v9
-; RV32-V128-NEXT: vsrl.vi v14, v9, 1
-; RV32-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV32-V128-NEXT: vrgatherei16.vv v10, v8, v14
-; RV32-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-V128-NEXT: vmv.v.i v0, 10
-; RV32-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV32-V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
-; RV32-V128-NEXT: vmv.v.v v8, v10
-; RV32-V128-NEXT: ret
-;
-; RV64-V128-LABEL: interleave_v2f64:
-; RV64-V128: # %bb.0:
-; RV64-V128-NEXT: vmv1r.v v12, v9
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-V128-NEXT: vid.v v10
-; RV64-V128-NEXT: vsrl.vi v14, v10, 1
-; RV64-V128-NEXT: vrgather.vv v10, v8, v14
-; RV64-V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-V128-NEXT: vmv.v.i v0, 10
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-V128-NEXT: vrgather.vv v10, v12, v14, v0.t
-; RV64-V128-NEXT: vmv.v.v v8, v10
-; RV64-V128-NEXT: ret
+; V128-LABEL: interleave_v2f64:
+; V128: # %bb.0:
+; V128-NEXT: vmv1r.v v12, v9
+; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; V128-NEXT: vid.v v9
+; V128-NEXT: vsrl.vi v14, v9, 1
+; V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; V128-NEXT: vrgatherei16.vv v10, v8, v14
+; V128-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; V128-NEXT: vmv.v.i v0, 10
+; V128-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; V128-NEXT: vrgatherei16.vv v10, v12, v14, v0.t
+; V128-NEXT: vmv.v.v v8, v10
+; V128-NEXT: ret
 ;
 ; RV32-V512-LABEL: interleave_v2f64:
 ; RV32-V512: # %bb.0:
@@ -255,56 +241,34 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; RV32-V128-NEXT: addi sp, sp, -16
 ; RV32-V128-NEXT: .cfi_def_cfa_offset 16
 ; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: li a1, 24
-; RV32-V128-NEXT: mul a0, a0, a1
+; RV32-V128-NEXT: slli a0, a0, 2
 ; RV32-V128-NEXT: sub sp, sp, a0
-; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 3
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-V128-NEXT: addi a0, sp, 16
-; RV32-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_0)
 ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
 ; RV32-V128-NEXT: li a1, 32
 ; RV32-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; RV32-V128-NEXT: vle32.v v24, (a0)
+; RV32-V128-NEXT: vle16.v v4, (a0)
 ; RV32-V128-NEXT: lui a0, %hi(.LCPI10_1)
 ; RV32-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; RV32-V128-NEXT: vle32.v v16, (a0)
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 4
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-V128-NEXT: vle16.v v24, (a0)
+; RV32-V128-NEXT: addi a0, sp, 16
+; RV32-V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
 ; RV32-V128-NEXT: lui a0, 699051
 ; RV32-V128-NEXT: addi a0, a0, -1366
 ; RV32-V128-NEXT: vmv.s.x v0, a0
-; RV32-V128-NEXT: vrgather.vv v16, v8, v24
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 4
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: slli a0, a0, 3
-; RV32-V128-NEXT: add a0, sp, a0
-; RV32-V128-NEXT: addi a0, a0, 16
-; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: vrgather.vv v16, v8, v24, v0.t
-; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-V128-NEXT: vmv4r.v v24, v8
+; RV32-V128-NEXT: vrgatherei16.vv v24, v8, v4
 ; RV32-V128-NEXT: addi a0, sp, 16
-; RV32-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-V128-NEXT: vwaddu.vv v0, v8, v24
+; RV32-V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
+; RV32-V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; RV32-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-V128-NEXT: vwaddu.vv v0, v8, v16
 ; RV32-V128-NEXT: li a0, -1
-; RV32-V128-NEXT: vwmaccu.vx v0, a0, v24
+; RV32-V128-NEXT: vwmaccu.vx v0, a0, v16
 ; RV32-V128-NEXT: vmv8r.v v8, v0
+; RV32-V128-NEXT: vmv8r.v v16, v24
 ; RV32-V128-NEXT: csrr a0, vlenb
-; RV32-V128-NEXT: li a1, 24
-; RV32-V128-NEXT: mul a0, a0, a1
+; RV32-V128-NEXT: slli a0, a0, 2
 ; RV32-V128-NEXT: add sp, sp, a0
 ; RV32-V128-NEXT: addi sp, sp, 16
 ; RV32-V128-NEXT: ret
@@ -314,56 +278,34 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; RV64-V128-NEXT: addi sp, sp, -16
 ; RV64-V128-NEXT: .cfi_def_cfa_offset 16
 ; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: li a1, 24
-; RV64-V128-NEXT: mul a0, a0, a1
+; RV64-V128-NEXT: slli a0, a0, 2
 ; RV64-V128-NEXT: sub sp, sp, a0
-; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 3
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV64-V128-NEXT: addi a0, sp, 16
-; RV64-V128-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_0)
 ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_0)
 ; RV64-V128-NEXT: li a1, 32
 ; RV64-V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu
-; RV64-V128-NEXT: vle32.v v24, (a0)
+; RV64-V128-NEXT: vle16.v v4, (a0)
 ; RV64-V128-NEXT: lui a0, %hi(.LCPI10_1)
 ; RV64-V128-NEXT: addi a0, a0, %lo(.LCPI10_1)
-; RV64-V128-NEXT: vle32.v v16, (a0)
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 4
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-V128-NEXT: vle16.v v24, (a0)
+; RV64-V128-NEXT: addi a0, sp, 16
+; RV64-V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-V128-NEXT: lui a0, 699051
 ; RV64-V128-NEXT: addiw a0, a0, -1366
 ; RV64-V128-NEXT: vmv.s.x v0, a0
-; RV64-V128-NEXT: vrgather.vv v16, v8, v24
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 4
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: slli a0, a0, 3
-; RV64-V128-NEXT: add a0, sp, a0
-; RV64-V128-NEXT: addi a0, a0, 16
-; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: vrgather.vv v16, v8, v24, v0.t
-; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV64-V128-NEXT: vmv4r.v v24, v8
+; RV64-V128-NEXT: vrgatherei16.vv v24, v8, v4
 ; RV64-V128-NEXT: addi a0, sp, 16
-; RV64-V128-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-V128-NEXT: vwaddu.vv v0, v8, v24
+; RV64-V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload
+; RV64-V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t
+; RV64-V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-V128-NEXT: vwaddu.vv v0, v8, v16
 ; RV64-V128-NEXT: li a0, -1
-; RV64-V128-NEXT: vwmaccu.vx v0, a0, v24
+; RV64-V128-NEXT: vwmaccu.vx v0, a0, v16
 ; RV64-V128-NEXT: vmv8r.v v8, v0
+; RV64-V128-NEXT: vmv8r.v v16, v24
 ; RV64-V128-NEXT: csrr a0, vlenb
-; RV64-V128-NEXT: li a1, 24
-; RV64-V128-NEXT: mul a0, a0, a1
+; RV64-V128-NEXT: slli a0, a0, 2
 ; RV64-V128-NEXT: add sp, sp, a0
 ; RV64-V128-NEXT: addi sp, sp, 16
 ; RV64-V128-NEXT: ret
@@ -450,10 +392,12 @@ define <4 x double> @unary_interleave_v4f64(<4 x double> %x) {
 ; RV64-V128: # %bb.0:
 ; RV64-V128-NEXT: lui a0, 12304
 ; RV64-V128-NEXT: addiw a0, a0, 512
-; RV64-V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-V128-NEXT: vmv.s.x v10, a0
-; RV64-V128-NEXT: vsext.vf8 v12, v10
-; RV64-V128-NEXT: vrgather.vv v10, v8, v12
+; RV64-V128-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-V128-NEXT: vsext.vf2 v12, v10
+; RV64-V128-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-V128-NEXT: vrgatherei16.vv v10, v8, v12
 ; RV64-V128-NEXT: vmv.v.v v8, v10
 ; RV64-V128-NEXT: ret
 ;

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll

Lines changed: 42 additions & 68 deletions
@@ -72,10 +72,12 @@ define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) {
 ; RV64: # %bb.0:
 ; RV64-NEXT: lui a0, 4096
 ; RV64-NEXT: addiw a0, a0, 513
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsext.vf8 v12, v10
-; RV64-NEXT: vrgather.vv v10, v8, v12
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vsext.vf2 v12, v10
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12
 ; RV64-NEXT: vmv.v.v v8, v10
 ; RV64-NEXT: ret
   %s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 0, i32 1>
@@ -100,77 +102,50 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) {
 ; RV64: # %bb.0:
 ; RV64-NEXT: lui a0, 4096
 ; RV64-NEXT: addiw a0, a0, 513
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vsext.vf8 v12, v10
-; RV64-NEXT: vrgather.vv v10, v8, v12
+; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT: vsext.vf2 v12, v10
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12
 ; RV64-NEXT: vmv.v.v v8, v10
 ; RV64-NEXT: ret
   %s = shufflevector <4 x double> poison, <4 x double> %x, <4 x i32> <i32 5, i32 6, i32 4, i32 5>
   ret <4 x double> %s
 }
 
 define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) {
-; RV32-LABEL: vrgather_shuffle_vv_v4f64:
-; RV32: # %bb.0:
-; RV32-NEXT: lui a0, %hi(.LCPI6_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI6_0)
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vle16.v v14, (a0)
-; RV32-NEXT: vrgatherei16.vv v12, v8, v14
-; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v0, 8
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV32-NEXT: vrgather.vi v12, v10, 1, v0.t
-; RV32-NEXT: vmv.v.v v8, v12
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_vv_v4f64:
-; RV64: # %bb.0:
-; RV64-NEXT: lui a0, %hi(.LCPI6_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI6_0)
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v14, (a0)
-; RV64-NEXT: vrgather.vv v12, v8, v14
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 8
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vi v12, v10, 1, v0.t
-; RV64-NEXT: vmv.v.v v8, v12
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_vv_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, %hi(.LCPI6_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vle16.v v14, (a0)
+; CHECK-NEXT: vrgatherei16.vv v12, v8, v14
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vrgather.vi v12, v10, 1, v0.t
+; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: ret
   %s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
   ret <4 x double> %s
 }
 
 define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) {
-; RV32-LABEL: vrgather_shuffle_xv_v4f64:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; RV32-NEXT: vid.v v12
-; RV32-NEXT: lui a0, %hi(.LCPI7_0)
-; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV32-NEXT: vlse64.v v10, (a0), zero
-; RV32-NEXT: vrsub.vi v12, v12, 4
-; RV32-NEXT: vmv.v.i v0, 12
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
-; RV32-NEXT: vmv.v.v v8, v10
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vrgather_shuffle_xv_v4f64:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: vrsub.vi v12, v10, 4
-; RV64-NEXT: lui a0, %hi(.LCPI7_0)
-; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV64-NEXT: vlse64.v v10, (a0), zero
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; RV64-NEXT: vmv.v.i v0, 12
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t
-; RV64-NEXT: vmv.v.v v8, v10
-; RV64-NEXT: ret
+; CHECK-LABEL: vrgather_shuffle_xv_v4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vid.v v12
+; CHECK-NEXT: lui a0, %hi(.LCPI7_0)
+; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; CHECK-NEXT: vlse64.v v10, (a0), zero
+; CHECK-NEXT: vrsub.vi v12, v12, 4
+; CHECK-NEXT: vmv.v.i v0, 12
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
+; CHECK-NEXT: vmv.v.v v8, v10
+; CHECK-NEXT: ret
   %s = shufflevector <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x double> %x, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
   ret <4 x double> %s
 }
@@ -193,17 +168,16 @@ define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) {
 ;
 ; RV64-LABEL: vrgather_shuffle_vx_v4f64:
 ; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vid.v v10
-; RV64-NEXT: li a0, 3
-; RV64-NEXT: vmul.vx v12, v10, a0
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vid.v v12
 ; RV64-NEXT: lui a0, %hi(.LCPI8_0)
 ; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0)
 ; RV64-NEXT: vlse64.v v10, (a0), zero
-; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: li a0, 3
+; RV64-NEXT: vmul.vx v12, v12, a0
 ; RV64-NEXT: vmv.v.i v0, 3
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu
+; RV64-NEXT: vrgatherei16.vv v10, v8, v12, v0.t
 ; RV64-NEXT: vmv.v.v v8, v10
 ; RV64-NEXT: ret
   %s = shufflevector <4 x double> %x, <4 x double> <double 2.0, double 2.0, double 2.0, double 2.0>, <4 x i32> <i32 0, i32 3, i32 6, i32 5>
