Commit 88738a7

[RISCV] Optimize two source deinterleave2 via ri.vunzip2{a,b} (#142667)
As done for the existing vnsrl cases, we can split a two-source deinterleave2 into two single-source deinterleave2 operations and a slideup. We can also use a concat-then-deinterleave2 tactic. Both are equally valid (except for the m8 source type case), and the concat-then-deinterleave2 form saves one instruction for fractional LMUL cases. Additionally, if we happen to know the exact VLEN and our fixed vectors span a whole number of vector registers, we can avoid the need to split or concat entirely and just use both register sources directly.
1 parent: 3531cc1
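
To make the tactics concrete, here is the <4 x i64> even-element case taken from the updated test file below; the commented sequence is the ZIP (ri.vunzip2a) lowering that the test now checks for the split-plus-slideup tactic. This is only an illustration drawn from the diff, not additional test coverage.

; Two-source deinterleave2: even elements of %a, then even elements of %b.
define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
entry:
  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64> %c
}
; Split tactic (each source deinterleaved on its own, then joined with a slideup):
;   ri.vunzip2a.vv v11, v9, v10
;   ri.vunzip2a.vv v9, v8, v10
;   vslideup.vi v9, v11, 2

When the exact VLEN is known and the type spans whole registers (the *_exact tests), the same shuffle lowers to a single ri.vunzip2a.vv.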

2 files changed: +147 −61 lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 28 additions & 0 deletions

@@ -5830,13 +5830,41 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
     if (V2.isUndef())
       return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+    if (auto VLEN = Subtarget.getRealVLen();
+        VLEN && VT.getSizeInBits().getKnownMinValue() % *VLEN == 0)
+      return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
     if (SDValue Src = foldConcatVector(V1, V2)) {
       EVT NewVT = VT.getDoubleNumVectorElementsVT();
       Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
       SDValue Res =
           lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
       return DAG.getExtractSubvector(DL, VT, Res, 0);
     }
+    // Deinterleave each source and concatenate them, or concat first, then
+    // deinterleave.
+    if (1 < count_if(Mask,
+                     [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+        1 < count_if(Mask,
+                     [&Mask](int Idx) { return Idx >= (int)Mask.size(); })) {
+
+      const unsigned EltSize = VT.getScalarSizeInBits();
+      const unsigned MinVLMAX = Subtarget.getRealMinVLen() / EltSize;
+      if (NumElts < MinVLMAX) {
+        MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
+        SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+        SDValue Res =
+            lowerVZIP(Opc, Concat, DAG.getUNDEF(ConcatVT), DL, DAG, Subtarget);
+        return DAG.getExtractSubvector(DL, VT, Res, 0);
+      }
+
+      SDValue Lo = lowerVZIP(Opc, V1, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+      SDValue Hi = lowerVZIP(Opc, V2, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+
+      MVT SubVT = VT.getHalfNumVectorElementsVT();
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+                         DAG.getExtractSubvector(DL, SubVT, Lo, 0),
+                         DAG.getExtractSubvector(DL, SubVT, Hi, 0));
+    }
   }
 
   if (SDValue V =
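
As a rough worked example of the NumElts < MinVLMAX split in the hunk above (illustrative only: it assumes a minimum VLEN of 128, i.e. Zvl128b, and the <4 x i16> function is a hypothetical case, not part of this test update):

; MinVLMAX = RealMinVLen / EltSize, the most elements a single vector register
; holds at the minimum VLEN.
;   <4 x i64>: MinVLMAX = 128 / 64 = 2, NumElts = 4 >= 2
;              -> deinterleave each source separately, then slideup/concat.
;   <4 x i16>: MinVLMAX = 128 / 16 = 8, NumElts = 4 < 8 (fractional LMUL)
;              -> concatenate first, deinterleave once, extract the low half,
;                 saving one instruction as noted in the commit message.
define <4 x i16> @unzip2a_dual_v4i16_small(<4 x i16> %a, <4 x i16> %b) {
entry:
  %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i16> %c
}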

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll

Lines changed: 119 additions & 61 deletions
@@ -1364,13 +1364,11 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64:
 ; ZIP:       # %bb.0: # %entry
-; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vslideup.vi v10, v9, 2
-; ZIP-NEXT:    vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT:    vmv.v.i v0, 12
-; ZIP-NEXT:    ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT:    vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT:    ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT:    vslideup.vi v9, v11, 2
+; ZIP-NEXT:    vmv.v.v v8, v9
 ; ZIP-NEXT:    ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1502,16 +1500,11 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) {
 ; ZIP-LABEL: unzip2a_dual_v16i64:
 ; ZIP:       # %bb.0: # %entry
 ; ZIP-NEXT:    vsetivli zero, 8, e64, m2, ta, ma
-; ZIP-NEXT:    ri.vunzip2a.vv v16, v8, v10
-; ZIP-NEXT:    vsetivli zero, 16, e16, m1, ta, ma
-; ZIP-NEXT:    vid.v v8
-; ZIP-NEXT:    li a0, -256
-; ZIP-NEXT:    vadd.vv v8, v8, v8
-; ZIP-NEXT:    vmv.s.x v0, a0
-; ZIP-NEXT:    vadd.vi v8, v8, -16
-; ZIP-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; ZIP-NEXT:    vrgatherei16.vv v16, v12, v8, v0.t
-; ZIP-NEXT:    vmv.v.v v8, v16
+; ZIP-NEXT:    ri.vunzip2a.vv v16, v12, v14
+; ZIP-NEXT:    ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT:    vsetivli zero, 16, e64, m4, ta, ma
+; ZIP-NEXT:    vslideup.vi v12, v16, 8
+; ZIP-NEXT:    vmv.v.v v8, v12
 ; ZIP-NEXT:    ret
 entry:
   %c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1557,13 +1550,9 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64_exact:
 ; ZIP:       # %bb.0: # %entry
-; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vslideup.vi v10, v9, 2
-; ZIP-NEXT:    vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT:    vmv.v.i v0, 12
-; ZIP-NEXT:    ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT:    vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
 ; ZIP-NEXT:    ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1609,13 +1598,10 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal
 ;
 ; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2:
 ; ZIP:       # %bb.0: # %entry
-; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vslideup.vi v10, v9, 2
-; ZIP-NEXT:    vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT:    vmv.v.i v0, 12
-; ZIP-NEXT:    ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT:    vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v9, 4
+; ZIP-NEXT:    ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT:    vmv.v.v v8, v9
 ; ZIP-NEXT:    ret
 entry:
   %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1740,39 +1726,111 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal
 ;
 ; ZIP-LABEL: unzip2a_dual_v16i64_exact:
 ; ZIP:       # %bb.0: # %entry
-; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT:    vslideup.vi v18, v15, 2
-; ZIP-NEXT:    vmv.v.i v16, 8
-; ZIP-NEXT:    vmv.v.i v17, 12
-; ZIP-NEXT:    vslideup.vi v20, v13, 2
-; ZIP-NEXT:    vmv.v.v v0, v16
-; ZIP-NEXT:    vslideup.vi v18, v15, 1, v0.t
-; ZIP-NEXT:    ri.vunzip2a.vv v15, v14, v19
-; ZIP-NEXT:    vmv.v.v v0, v17
-; ZIP-NEXT:    vmerge.vvm v15, v15, v18, v0
-; ZIP-NEXT:    vmv.v.v v0, v16
-; ZIP-NEXT:    vslideup.vi v20, v13, 1, v0.t
-; ZIP-NEXT:    ri.vunzip2a.vv v14, v12, v13
-; ZIP-NEXT:    vslideup.vi v12, v11, 2
-; ZIP-NEXT:    vslideup.vi v18, v9, 2
-; ZIP-NEXT:    vmv.v.v v0, v17
-; ZIP-NEXT:    vmerge.vvm v14, v14, v20, v0
-; ZIP-NEXT:    li a0, -256
-; ZIP-NEXT:    ri.vunzip2a.vv v20, v10, v13
-; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v19
-; ZIP-NEXT:    vmv.v.v v0, v16
-; ZIP-NEXT:    vslideup.vi v12, v11, 1, v0.t
-; ZIP-NEXT:    vmv.v.v v0, v17
-; ZIP-NEXT:    vmerge.vvm v13, v20, v12, v0
-; ZIP-NEXT:    vmv.v.v v0, v16
-; ZIP-NEXT:    vslideup.vi v18, v9, 1, v0.t
-; ZIP-NEXT:    vmv.v.v v0, v17
-; ZIP-NEXT:    vmerge.vvm v12, v10, v18, v0
-; ZIP-NEXT:    vmv.s.x v0, a0
 ; ZIP-NEXT:    vsetivli zero, 16, e64, m4, ta, ma
-; ZIP-NEXT:    vmerge.vvm v8, v12, v12, v0
+; ZIP-NEXT:    ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT:    vmv.v.v v8, v16
 ; ZIP-NEXT:    ret
 entry:
   %c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   ret <16 x i64> %c
 }
+
+define <4 x i64> @unzip2b_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; V-LABEL: unzip2b_dual_v4i64:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
+; V-NEXT:    vmv.v.i v0, 2
+; V-NEXT:    vslidedown.vi v10, v8, 1
+; V-NEXT:    vslidedown.vi v10, v8, 2, v0.t
+; V-NEXT:    vmv.v.i v0, 4
+; V-NEXT:    vmv1r.v v8, v9
+; V-NEXT:    vslideup.vi v8, v9, 1, v0.t
+; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vmerge.vvm v8, v10, v8, v0
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: unzip2b_dual_v4i64:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    ld a3, 8(a2)
+; ZVE32F-NEXT:    ld a2, 24(a2)
+; ZVE32F-NEXT:    ld a4, 8(a1)
+; ZVE32F-NEXT:    ld a1, 24(a1)
+; ZVE32F-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    srli a5, a2, 32
+; ZVE32F-NEXT:    srli a6, a3, 32
+; ZVE32F-NEXT:    srli a7, a1, 32
+; ZVE32F-NEXT:    srli t0, a4, 32
+; ZVE32F-NEXT:    vmv.v.x v8, a4
+; ZVE32F-NEXT:    vmv.v.x v9, a3
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; ZVE32F-NEXT:    vslidedown.vi v9, v8, 4, v0.t
+; ZVE32F-NEXT:    vse32.v v9, (a0)
+; ZVE32F-NEXT:    ret
+;
+; ZIP-LABEL: unzip2b_dual_v4i64:
+; ZIP:       # %bb.0: # %entry
+; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT:    ri.vunzip2b.vv v11, v9, v10
+; ZIP-NEXT:    ri.vunzip2b.vv v9, v8, v10
+; ZIP-NEXT:    vslideup.vi v9, v11, 2
+; ZIP-NEXT:    vmv.v.v v8, v9
+; ZIP-NEXT:    ret
+entry:
+  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i64> %c
+}
+
+define <4 x i64> @unzip2b_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_range(4,4) {
+; V-LABEL: unzip2b_dual_v4i64_exact:
+; V:       # %bb.0: # %entry
+; V-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
+; V-NEXT:    vmv.v.i v0, 2
+; V-NEXT:    vslidedown.vi v10, v8, 1
+; V-NEXT:    vslidedown.vi v10, v8, 2, v0.t
+; V-NEXT:    vmv.v.i v0, 4
+; V-NEXT:    vmv1r.v v8, v9
+; V-NEXT:    vslideup.vi v8, v9, 1, v0.t
+; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vmerge.vvm v8, v10, v8, v0
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: unzip2b_dual_v4i64_exact:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    ld a3, 8(a2)
+; ZVE32F-NEXT:    ld a2, 24(a2)
+; ZVE32F-NEXT:    ld a4, 8(a1)
+; ZVE32F-NEXT:    ld a1, 24(a1)
+; ZVE32F-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
+; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    srli a5, a2, 32
+; ZVE32F-NEXT:    srli a6, a3, 32
+; ZVE32F-NEXT:    srli a7, a1, 32
+; ZVE32F-NEXT:    srli t0, a4, 32
+; ZVE32F-NEXT:    vmv.v.x v8, a4
+; ZVE32F-NEXT:    vmv.v.x v9, a3
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
+; ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; ZVE32F-NEXT:    vslidedown.vi v9, v8, 4, v0.t
+; ZVE32F-NEXT:    vs1r.v v9, (a0)
+; ZVE32F-NEXT:    ret
+;
+; ZIP-LABEL: unzip2b_dual_v4i64_exact:
+; ZIP:       # %bb.0: # %entry
+; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT:    ri.vunzip2b.vv v10, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    ret
+entry:
+  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i64> %c
+}
