Skip to content

Commit 9b7282e

Browse files
authored
[RISCV] Recognize de-interleave shuffles with 2 sources. (#127272)
We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.
1 parent 5d62a79 commit 9b7282e

File tree

5 files changed

+543
-41
lines changed

5 files changed

+543
-41
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5593,6 +5593,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55935593
1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
55945594
if (SDValue Src = getSingleShuffleSrc(VT, V1, V2))
55955595
return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
5596+
if (1 < count_if(Mask,
5597+
[&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
5598+
1 < count_if(Mask, [&Mask](int Idx) {
5599+
return Idx >= (int)Mask.size();
5600+
})) {
5601+
// Narrow each source and concatenate them.
5602+
// FIXME: For small LMUL it is better to concatenate first.
5603+
MVT HalfVT = VT.getHalfNumVectorElementsVT();
5604+
SDValue Lo =
5605+
getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
5606+
SDValue Hi =
5607+
getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
5608+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
5609+
}
55965610
}
55975611
}
55985612
}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll

Lines changed: 13 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
1010
; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
1111
; CHECK: # %bb.0:
1212
; CHECK-NEXT: li a1, 32
13-
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
14-
; CHECK-NEXT: vmv.v.i v10, 0
15-
; CHECK-NEXT: vid.v v9
1613
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
17-
; CHECK-NEXT: vlm.v v8, (a0)
18-
; CHECK-NEXT: li a0, -256
14+
; CHECK-NEXT: vlm.v v0, (a0)
1915
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
20-
; CHECK-NEXT: vadd.vv v11, v9, v9
21-
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
22-
; CHECK-NEXT: vmv.s.x v9, a0
23-
; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
24-
; CHECK-NEXT: vadd.vi v12, v11, -16
16+
; CHECK-NEXT: vmv.v.i v8, 0
17+
; CHECK-NEXT: vmerge.vim v9, v8, 1, v0
2518
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
26-
; CHECK-NEXT: vslidedown.vi v0, v8, 2
19+
; CHECK-NEXT: vslidedown.vi v0, v0, 2
20+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
21+
; CHECK-NEXT: vnsrl.wi v10, v9, 0
2722
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
28-
; CHECK-NEXT: vadd.vi v11, v11, -15
29-
; CHECK-NEXT: vmerge.vim v13, v10, 1, v0
30-
; CHECK-NEXT: vmv1r.v v0, v8
31-
; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
23+
; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
3224
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
33-
; CHECK-NEXT: vnsrl.wi v10, v8, 0
25+
; CHECK-NEXT: vnsrl.wi v9, v9, 8
26+
; CHECK-NEXT: vnsrl.wi v11, v8, 0
3427
; CHECK-NEXT: vnsrl.wi v8, v8, 8
35-
; CHECK-NEXT: vmv1r.v v0, v9
36-
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu
37-
; CHECK-NEXT: vrgather.vv v10, v13, v12, v0.t
38-
; CHECK-NEXT: vrgather.vv v8, v13, v11, v0.t
28+
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
29+
; CHECK-NEXT: vslideup.vi v10, v11, 8
30+
; CHECK-NEXT: vslideup.vi v9, v8, 8
3931
; CHECK-NEXT: vmsne.vi v0, v10, 0
40-
; CHECK-NEXT: vmsne.vi v8, v8, 0
32+
; CHECK-NEXT: vmsne.vi v8, v9, 0
4133
; CHECK-NEXT: ret
4234
%vec = load <32 x i1>, ptr %p
4335
%deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -369,3 +369,89 @@ entry:
369369
store <2 x i8> %shuffle.i5, ptr %out, align 1
370370
ret void
371371
}
372+
373+
; Two-source de-interleave test, stride 4 starting at element 0.
; The shuffle mask <0, 4, 8, 12, undef...> picks elements 0 and 4 of the first
; <8 x i8> load and elements 0 and 4 of the second (mask indices 8 and 12).
; The expected output narrows each source twice with vnsrl.wi (shift 0, first
; at e16 and then at e8) and joins the two halves with vslideup.vi by 4.
define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
374+
; CHECK-LABEL: deinterleave4_0_i8_two_source:
375+
; CHECK: # %bb.0: # %entry
376+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
377+
; CHECK-NEXT: vle8.v v8, (a1)
378+
; CHECK-NEXT: vle8.v v9, (a0)
379+
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
380+
; CHECK-NEXT: vnsrl.wi v8, v8, 0
381+
; CHECK-NEXT: vnsrl.wi v9, v9, 0
382+
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
383+
; CHECK-NEXT: vnsrl.wi v8, v8, 0
384+
; CHECK-NEXT: vnsrl.wi v9, v9, 0
385+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
386+
; CHECK-NEXT: vslideup.vi v9, v8, 4
387+
; CHECK-NEXT: vse8.v v9, (a2)
388+
; CHECK-NEXT: ret
389+
entry:
390+
%0 = load <8 x i8>, ptr %in0, align 1
391+
%1 = load <8 x i8>, ptr %in1, align 1
392+
%shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
393+
store <8 x i8> %shuffle.i5, ptr %out, align 1
394+
ret void
395+
}
396+
397+
; Two-source de-interleave test, stride 4 starting at element 1.
; The shuffle mask <1, 5, 9, 13, undef...> picks elements 1 and 5 of the first
; <8 x i8> load and elements 1 and 5 of the second (mask indices 9 and 13).
; The expected output narrows each source with vnsrl.wi by 8 at e16, then with
; vnsrl.wi by 0 at e8, and joins the two halves with vslideup.vi by 4.
define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
398+
; CHECK-LABEL: deinterleave4_8_i8_two_source:
399+
; CHECK: # %bb.0: # %entry
400+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
401+
; CHECK-NEXT: vle8.v v8, (a1)
402+
; CHECK-NEXT: vle8.v v9, (a0)
403+
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
404+
; CHECK-NEXT: vnsrl.wi v8, v8, 8
405+
; CHECK-NEXT: vnsrl.wi v9, v9, 8
406+
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
407+
; CHECK-NEXT: vnsrl.wi v8, v8, 0
408+
; CHECK-NEXT: vnsrl.wi v9, v9, 0
409+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
410+
; CHECK-NEXT: vslideup.vi v9, v8, 4
411+
; CHECK-NEXT: vse8.v v9, (a2)
412+
; CHECK-NEXT: ret
413+
entry:
414+
%0 = load <8 x i8>, ptr %in0, align 1
415+
%1 = load <8 x i8>, ptr %in1, align 1
416+
%shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
417+
store <8 x i8> %shuffle.i5, ptr %out, align 1
418+
ret void
419+
}
420+
421+
; Two-source de-interleave test, stride 8 starting at element 0.
; The shuffle mask <0, 8, undef...> picks element 0 of the first <8 x i8> load
; and element 0 of the second (mask index 8), so each source contributes a
; single element.
; The expected output contains no vnsrl.wi: it loads both sources and merges
; them with one tail-undisturbed vslideup.vi by 1.
define void @deinterleave8_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
422+
; CHECK-LABEL: deinterleave8_0_i8_two_source:
423+
; CHECK: # %bb.0: # %entry
424+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
425+
; CHECK-NEXT: vle8.v v8, (a0)
426+
; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, ma
427+
; CHECK-NEXT: vle8.v v9, (a1)
428+
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma
429+
; CHECK-NEXT: vslideup.vi v8, v9, 1
430+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
431+
; CHECK-NEXT: vse8.v v8, (a2)
432+
; CHECK-NEXT: ret
433+
entry:
434+
%0 = load <8 x i8>, ptr %in0, align 1
435+
%1 = load <8 x i8>, ptr %in1, align 1
436+
%shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
437+
store <8 x i8> %shuffle.i5, ptr %out, align 1
438+
ret void
439+
}
440+
441+
; Two-source de-interleave test, stride 8 starting at element 1.
; The shuffle mask <1, 9, undef...> picks element 1 of the first <8 x i8> load
; and element 1 of the second (mask index 9), so each source contributes a
; single element.
; The expected output contains no vnsrl.wi: it uses a masked vrgather.vi
; (mask materialized with vmv.v.i v0, -3) to combine the two sources.
define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
442+
; CHECK-LABEL: deinterleave8_8_i8_two_source:
443+
; CHECK: # %bb.0: # %entry
444+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
445+
; CHECK-NEXT: vle8.v v8, (a0)
446+
; CHECK-NEXT: vle8.v v9, (a1)
447+
; CHECK-NEXT: vmv.v.i v0, -3
448+
; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t
449+
; CHECK-NEXT: vse8.v v9, (a2)
450+
; CHECK-NEXT: ret
451+
entry:
452+
%0 = load <8 x i8>, ptr %in0, align 1
453+
%1 = load <8 x i8>, ptr %in1, align 1
454+
%shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
455+
store <8 x i8> %shuffle.i5, ptr %out, align 1
456+
ret void
457+
}

0 commit comments

Comments
 (0)