@@ -1384,11 +1384,10 @@ define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
ret <4 x float> %r
}

- define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
+ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; SSE2-LABEL: test_fmaximum_v4f16:
; SSE2: # %bb.0:
; SSE2-NEXT: subq $104, %rsp
- ; SSE2-NEXT: .cfi_def_cfa_offset 112
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: psrld $16, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1524,13 +1523,11 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $104, %rsp
- ; SSE2-NEXT: .cfi_def_cfa_offset 8
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximum_v4f16:
; AVX1: # %bb.0:
; AVX1-NEXT: subq $120, %rsp
- ; AVX1-NEXT: .cfi_def_cfa_offset 128
; AVX1-NEXT: vmovaps %xmm0, %xmm2
; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1636,37 +1633,179 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT: addq $120, %rsp
- ; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximum_v4f16:
; AVX512: # %bb.0:
- ; AVX512-NEXT: vcvtph2ps %xmm0, %ymm2
- ; AVX512-NEXT: vcvtph2ps %xmm1, %ymm3
- ; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %ymm4
- ; AVX512-NEXT: vpmovdw %zmm4, %ymm4
- ; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm4
- ; AVX512-NEXT: vcmpunordps %ymm3, %ymm2, %ymm2
- ; AVX512-NEXT: vpmovdw %zmm2, %ymm2
- ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
- ; AVX512-NEXT: vpblendvb %xmm2, %xmm3, %xmm4, %xmm2
- ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
- ; AVX512-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
- ; AVX512-NEXT: vpblendvb %xmm4, %xmm0, %xmm2, %xmm0
- ; AVX512-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3
- ; AVX512-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
- ; AVX512-NEXT: vcvtph2ps %xmm2, %ymm1
- ; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
- ; AVX512-NEXT: vcmpeqps %ymm3, %ymm1, %ymm1
- ; AVX512-NEXT: vpmovdw %zmm1, %ymm1
- ; AVX512-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
- ; AVX512-NEXT: vzeroupper
+ ; AVX512-NEXT: pushq %rbp
+ ; AVX512-NEXT: pushq %r15
+ ; AVX512-NEXT: pushq %r14
+ ; AVX512-NEXT: pushq %r13
+ ; AVX512-NEXT: pushq %r12
+ ; AVX512-NEXT: pushq %rbx
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: xorl %eax, %eax
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF
+ ; AVX512-NEXT: movl $0, %edx
+ ; AVX512-NEXT: cmovpl %ecx, %edx
+ ; AVX512-NEXT: movl $0, %edi
+ ; AVX512-NEXT: cmoval %ecx, %edi
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %esi
+ ; AVX512-NEXT: cmovpl %ecx, %esi
+ ; AVX512-NEXT: movl $0, %r9d
+ ; AVX512-NEXT: cmoval %ecx, %r9d
+ ; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %r8d
+ ; AVX512-NEXT: cmovpl %ecx, %r8d
+ ; AVX512-NEXT: movl $0, %r11d
+ ; AVX512-NEXT: cmoval %ecx, %r11d
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %r10d
+ ; AVX512-NEXT: cmovpl %ecx, %r10d
+ ; AVX512-NEXT: movl $0, %ebp
+ ; AVX512-NEXT: cmoval %ecx, %ebp
+ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %ebx
+ ; AVX512-NEXT: cmovpl %ecx, %ebx
+ ; AVX512-NEXT: movl $0, %r14d
+ ; AVX512-NEXT: cmoval %ecx, %r14d
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %r15d
+ ; AVX512-NEXT: cmovpl %ecx, %r15d
+ ; AVX512-NEXT: movl $0, %r12d
+ ; AVX512-NEXT: cmoval %ecx, %r12d
+ ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
+ ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm2, %xmm3
+ ; AVX512-NEXT: movl $0, %r13d
+ ; AVX512-NEXT: cmoval %ecx, %r13d
+ ; AVX512-NEXT: vmovd %r13d, %xmm2
+ ; AVX512-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm2
+ ; AVX512-NEXT: vpinsrw $2, %r14d, %xmm2, %xmm2
+ ; AVX512-NEXT: vpinsrw $3, %ebp, %xmm2, %xmm2
+ ; AVX512-NEXT: vpinsrw $4, %r11d, %xmm2, %xmm2
+ ; AVX512-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2
+ ; AVX512-NEXT: vpinsrw $6, %edi, %xmm2, %xmm2
+ ; AVX512-NEXT: movl $0, %edi
+ ; AVX512-NEXT: cmovpl %ecx, %edi
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+ ; AVX512-NEXT: vucomiss %xmm3, %xmm4
+ ; AVX512-NEXT: movl $0, %r9d
+ ; AVX512-NEXT: cmoval %ecx, %r9d
+ ; AVX512-NEXT: vpinsrw $7, %r9d, %xmm2, %xmm2
+ ; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm2
+ ; AVX512-NEXT: vmovd %edi, %xmm3
+ ; AVX512-NEXT: vpinsrw $1, %r15d, %xmm3, %xmm3
+ ; AVX512-NEXT: vpinsrw $2, %ebx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpinsrw $3, %r10d, %xmm3, %xmm3
+ ; AVX512-NEXT: vpinsrw $4, %r8d, %xmm3, %xmm3
+ ; AVX512-NEXT: vpinsrw $5, %esi, %xmm3, %xmm3
+ ; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: movl $0, %edx
+ ; AVX512-NEXT: cmovpl %ecx, %edx
+ ; AVX512-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
+ ; AVX512-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+ ; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm3
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm3
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm3
+ ; AVX512-NEXT: movl $65535, %esi # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %esi
+ ; AVX512-NEXT: cmovpl %eax, %esi
+ ; AVX512-NEXT: vmovd %esi, %xmm3
+ ; AVX512-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: movl $65535, %edx # imm = 0xFFFF
+ ; AVX512-NEXT: cmovnel %eax, %edx
+ ; AVX512-NEXT: cmovpl %eax, %edx
+ ; AVX512-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+ ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+ ; AVX512-NEXT: vucomiss %xmm4, %xmm5
+ ; AVX512-NEXT: cmovnel %eax, %ecx
+ ; AVX512-NEXT: cmovpl %eax, %ecx
+ ; AVX512-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3
+ ; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+ ; AVX512-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm5
+ ; AVX512-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm0
+ ; AVX512-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm4
+ ; AVX512-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+ ; AVX512-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
+ ; AVX512-NEXT: popq %rbx
+ ; AVX512-NEXT: popq %r12
+ ; AVX512-NEXT: popq %r13
+ ; AVX512-NEXT: popq %r14
+ ; AVX512-NEXT: popq %r15
+ ; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
;
; X86-LABEL: test_fmaximum_v4f16:
; X86: # %bb.0:
; X86-NEXT: subl $164, %esp
- ; X86-NEXT: .cfi_def_cfa_offset 168
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
@@ -1806,7 +1945,6 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) {
; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; X86-NEXT: addl $164, %esp
- ; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
%r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %x, <4 x half> %y)
ret <4 x half> %r