@@ -1364,13 +1364,11 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
;
; ZIP-LABEL: unzip2a_dual_v4i64:
; ZIP: # %bb.0: # %entry
- ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
- ; ZIP-NEXT: vmv.v.i v0, 8
- ; ZIP-NEXT: vslideup.vi v10, v9, 2
- ; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
- ; ZIP-NEXT: vmv.v.i v0, 12
- ; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
- ; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+ ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+ ; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+ ; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+ ; ZIP-NEXT: vslideup.vi v9, v11, 2
+ ; ZIP-NEXT: vmv.v.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1502,16 +1500,11 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) {
; ZIP-LABEL: unzip2a_dual_v16i64:
; ZIP: # %bb.0: # %entry
; ZIP-NEXT: vsetivli zero, 8, e64, m2, ta, ma
- ; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v10
- ; ZIP-NEXT: vsetivli zero, 16, e16, m1, ta, ma
- ; ZIP-NEXT: vid.v v8
- ; ZIP-NEXT: li a0, -256
- ; ZIP-NEXT: vadd.vv v8, v8, v8
- ; ZIP-NEXT: vmv.s.x v0, a0
- ; ZIP-NEXT: vadd.vi v8, v8, -16
- ; ZIP-NEXT: vsetvli zero, zero, e64, m4, ta, mu
- ; ZIP-NEXT: vrgatherei16.vv v16, v12, v8, v0.t
- ; ZIP-NEXT: vmv.v.v v8, v16
+ ; ZIP-NEXT: ri.vunzip2a.vv v16, v12, v14
+ ; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+ ; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
+ ; ZIP-NEXT: vslideup.vi v12, v16, 8
+ ; ZIP-NEXT: vmv.v.v v8, v12
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1557,13 +1550,9 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact:
; ZIP: # %bb.0: # %entry
- ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
- ; ZIP-NEXT: vmv.v.i v0, 8
- ; ZIP-NEXT: vslideup.vi v10, v9, 2
- ; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
- ; ZIP-NEXT: vmv.v.i v0, 12
- ; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
- ; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+ ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+ ; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+ ; ZIP-NEXT: vmv.v.v v8, v10
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1609,13 +1598,10 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2:
; ZIP: # %bb.0: # %entry
- ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
- ; ZIP-NEXT: vmv.v.i v0, 8
- ; ZIP-NEXT: vslideup.vi v10, v9, 2
- ; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
- ; ZIP-NEXT: vmv.v.i v0, 12
- ; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
- ; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+ ; ZIP-NEXT: vsetivli zero, 8, e64, m1, ta, ma
+ ; ZIP-NEXT: vslideup.vi v8, v9, 4
+ ; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+ ; ZIP-NEXT: vmv.v.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1740,39 +1726,111 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v16i64_exact:
; ZIP: # %bb.0: # %entry
- ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
- ; ZIP-NEXT: vslideup.vi v18, v15, 2
- ; ZIP-NEXT: vmv.v.i v16, 8
- ; ZIP-NEXT: vmv.v.i v17, 12
- ; ZIP-NEXT: vslideup.vi v20, v13, 2
- ; ZIP-NEXT: vmv.v.v v0, v16
- ; ZIP-NEXT: vslideup.vi v18, v15, 1, v0.t
- ; ZIP-NEXT: ri.vunzip2a.vv v15, v14, v19
- ; ZIP-NEXT: vmv.v.v v0, v17
- ; ZIP-NEXT: vmerge.vvm v15, v15, v18, v0
- ; ZIP-NEXT: vmv.v.v v0, v16
- ; ZIP-NEXT: vslideup.vi v20, v13, 1, v0.t
- ; ZIP-NEXT: ri.vunzip2a.vv v14, v12, v13
- ; ZIP-NEXT: vslideup.vi v12, v11, 2
- ; ZIP-NEXT: vslideup.vi v18, v9, 2
- ; ZIP-NEXT: vmv.v.v v0, v17
- ; ZIP-NEXT: vmerge.vvm v14, v14, v20, v0
- ; ZIP-NEXT: li a0, -256
- ; ZIP-NEXT: ri.vunzip2a.vv v20, v10, v13
- ; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v19
- ; ZIP-NEXT: vmv.v.v v0, v16
- ; ZIP-NEXT: vslideup.vi v12, v11, 1, v0.t
- ; ZIP-NEXT: vmv.v.v v0, v17
- ; ZIP-NEXT: vmerge.vvm v13, v20, v12, v0
- ; ZIP-NEXT: vmv.v.v v0, v16
- ; ZIP-NEXT: vslideup.vi v18, v9, 1, v0.t
- ; ZIP-NEXT: vmv.v.v v0, v17
- ; ZIP-NEXT: vmerge.vvm v12, v10, v18, v0
- ; ZIP-NEXT: vmv.s.x v0, a0
; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
- ; ZIP-NEXT: vmerge.vvm v8, v12, v12, v0
+ ; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v12
+ ; ZIP-NEXT: vmv.v.v v8, v16
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
ret <16 x i64> %c
}
+
+ define <4 x i64> @unzip2b_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
+ ; V-LABEL: unzip2b_dual_v4i64:
+ ; V: # %bb.0: # %entry
+ ; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu
+ ; V-NEXT: vmv.v.i v0, 2
+ ; V-NEXT: vslidedown.vi v10, v8, 1
+ ; V-NEXT: vslidedown.vi v10, v8, 2, v0.t
+ ; V-NEXT: vmv.v.i v0, 4
+ ; V-NEXT: vmv1r.v v8, v9
+ ; V-NEXT: vslideup.vi v8, v9, 1, v0.t
+ ; V-NEXT: vmv.v.i v0, 12
+ ; V-NEXT: vmerge.vvm v8, v10, v8, v0
+ ; V-NEXT: ret
+ ;
+ ; ZVE32F-LABEL: unzip2b_dual_v4i64:
+ ; ZVE32F: # %bb.0: # %entry
+ ; ZVE32F-NEXT: ld a3, 8(a2)
+ ; ZVE32F-NEXT: ld a2, 24(a2)
+ ; ZVE32F-NEXT: ld a4, 8(a1)
+ ; ZVE32F-NEXT: ld a1, 24(a1)
+ ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+ ; ZVE32F-NEXT: vmv.v.i v0, 15
+ ; ZVE32F-NEXT: srli a5, a2, 32
+ ; ZVE32F-NEXT: srli a6, a3, 32
+ ; ZVE32F-NEXT: srli a7, a1, 32
+ ; ZVE32F-NEXT: srli t0, a4, 32
+ ; ZVE32F-NEXT: vmv.v.x v8, a4
+ ; ZVE32F-NEXT: vmv.v.x v9, a3
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a2
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5
+ ; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t
+ ; ZVE32F-NEXT: vse32.v v9, (a0)
+ ; ZVE32F-NEXT: ret
+ ;
+ ; ZIP-LABEL: unzip2b_dual_v4i64:
+ ; ZIP: # %bb.0: # %entry
+ ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+ ; ZIP-NEXT: ri.vunzip2b.vv v11, v9, v10
+ ; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v10
+ ; ZIP-NEXT: vslideup.vi v9, v11, 2
+ ; ZIP-NEXT: vmv.v.v v8, v9
+ ; ZIP-NEXT: ret
+ entry:
+ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i64> %c
+ }
+
+ define <4 x i64> @unzip2b_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_range(4,4) {
+ ; V-LABEL: unzip2b_dual_v4i64_exact:
+ ; V: # %bb.0: # %entry
+ ; V-NEXT: vsetivli zero, 4, e64, m1, ta, mu
+ ; V-NEXT: vmv.v.i v0, 2
+ ; V-NEXT: vslidedown.vi v10, v8, 1
+ ; V-NEXT: vslidedown.vi v10, v8, 2, v0.t
+ ; V-NEXT: vmv.v.i v0, 4
+ ; V-NEXT: vmv1r.v v8, v9
+ ; V-NEXT: vslideup.vi v8, v9, 1, v0.t
+ ; V-NEXT: vmv.v.i v0, 12
+ ; V-NEXT: vmerge.vvm v8, v10, v8, v0
+ ; V-NEXT: ret
+ ;
+ ; ZVE32F-LABEL: unzip2b_dual_v4i64_exact:
+ ; ZVE32F: # %bb.0: # %entry
+ ; ZVE32F-NEXT: ld a3, 8(a2)
+ ; ZVE32F-NEXT: ld a2, 24(a2)
+ ; ZVE32F-NEXT: ld a4, 8(a1)
+ ; ZVE32F-NEXT: ld a1, 24(a1)
+ ; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+ ; ZVE32F-NEXT: vmv.v.i v0, 15
+ ; ZVE32F-NEXT: srli a5, a2, 32
+ ; ZVE32F-NEXT: srli a6, a3, 32
+ ; ZVE32F-NEXT: srli a7, a1, 32
+ ; ZVE32F-NEXT: srli t0, a4, 32
+ ; ZVE32F-NEXT: vmv.v.x v8, a4
+ ; ZVE32F-NEXT: vmv.v.x v9, a3
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, t0
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a6
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, a1
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a2
+ ; ZVE32F-NEXT: vslide1down.vx v8, v8, a7
+ ; ZVE32F-NEXT: vslide1down.vx v9, v9, a5
+ ; ZVE32F-NEXT: vslidedown.vi v9, v8, 4, v0.t
+ ; ZVE32F-NEXT: vs1r.v v9, (a0)
+ ; ZVE32F-NEXT: ret
+ ;
+ ; ZIP-LABEL: unzip2b_dual_v4i64_exact:
+ ; ZIP: # %bb.0: # %entry
+ ; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+ ; ZIP-NEXT: ri.vunzip2b.vv v10, v8, v9
+ ; ZIP-NEXT: vmv.v.v v8, v10
+ ; ZIP-NEXT: ret
+ entry:
+ %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i64> %c
+ }